Library

library(adegenet)
library(ade4)
library(car)
library(ComplexHeatmap)
library(data.table)
library(dartR)
library(ecodist)
library(hierfstat)
library(gplots)
library(LEA)
library(lfmm)
library(maps)
library(mapplots)
library(MASS)
library(pegas)
library(poppr)
library(qvalue)
library(randomcoloR)
library(raster)
library(reshape)
library(rworldmap)
library(scales)
library(SeqArray)
library(SeqVarTools)
library(shape)
library(SNPRelate)
library(stringr)
library(vcfR)
library(vegan)
library(xtable)
library(PBSmapping)

Session information

R.version
                 _                           
  platform       x86_64-apple-darwin17.0     
  arch           x86_64                      
  os             darwin17.0                  
  system         x86_64, darwin17.0          
  status                                     
  major          4                           
  minor          2.2                         
  year           2022                        
  month          10                          
  day            31                          
  svn rev        83211                       
  language       R                           
  version.string R version 4.2.2 (2022-10-31)
  nickname       Innocent and Trusting
sessionInfo()
  R version 4.2.2 (2022-10-31)
  Platform: x86_64-apple-darwin17.0 (64-bit)
  Running under: macOS Catalina 10.15.7
  
  Matrix products: default
  BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
  LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
  
  locale:
  [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
  
  attached base packages:
  [1] grid      stats     graphics  grDevices utils     datasets  methods  
  [8] base     
  
  other attached packages:
   [1] PBSmapping_2.73.2     xtable_1.8-4          vegan_2.6-4          
   [4] lattice_0.21-8        permute_0.9-7         vcfR_1.14.0          
   [7] stringr_1.5.0         SNPRelate_1.32.2      shape_1.4.6          
  [10] SeqVarTools_1.36.0    SeqArray_1.38.0       gdsfmt_1.34.1        
  [13] scales_1.2.1          rworldmap_1.3-6       reshape_0.8.9        
  [16] raster_3.6-20         sp_1.6-0              randomcoloR_1.1.0.1  
  [19] qvalue_2.30.0         poppr_2.9.4           pegas_1.2            
  [22] ape_5.7-1             MASS_7.3-59           mapplots_1.5.1       
  [25] maps_3.4.1            lfmm_1.1              LEA_3.10.2           
  [28] gplots_3.1.3          hierfstat_0.5-11      ecodist_2.0.9        
  [31] dartR_2.7.2           dartR.data_1.0.2      dplyr_1.1.2          
  [34] ggplot2_3.4.2         data.table_1.14.8     ComplexHeatmap_2.14.0
  [37] car_3.1-2             carData_3.0-5         adegenet_2.1.10      
  [40] ade4_1.7-22           rmarkdown_2.21       
  
  loaded via a namespace (and not attached):
    [1] utf8_1.2.3             R.utils_2.12.2         tidyselect_1.2.0      
    [4] combinat_0.0-8         Rtsne_0.16             maptools_1.1-6        
    [7] StAMPP_1.6.3           GWASExactHW_1.01       munsell_0.5.0         
   [10] codetools_0.2-19       withr_2.5.0            colorspace_2.1-0      
   [13] Biobase_2.58.0         knitr_1.42             stats4_4.2.2          
   [16] RgoogleMaps_1.4.5.3    logistf_1.24.1         GenomeInfoDbData_1.2.9
   [19] gap.datasets_0.0.5     vctrs_0.6.2            generics_0.1.3        
   [22] xfun_0.39              R6_2.5.1               doParallel_1.0.17     
   [25] GenomeInfoDb_1.34.9    clue_0.3-64            fields_14.1           
   [28] bitops_1.0-7           cachem_1.0.8           promises_1.2.0.1      
   [31] pinfsc50_1.2.0         gtable_0.3.3           formula.tools_1.7.1   
   [34] spam_2.9-1             rlang_1.1.1            calibrate_1.7.7       
   [37] GlobalOptions_0.1.2    splines_4.2.2          rgdal_1.6-6           
   [40] broom_1.0.4            yaml_2.3.7             reshape2_1.4.4        
   [43] abind_1.4-5            backports_1.4.1        httpuv_1.6.9          
   [46] tools_4.2.2            ellipsis_0.3.2         jquerylib_0.1.4       
   [49] RColorBrewer_1.1-3     BiocGenerics_0.44.0    Rcpp_1.0.10           
   [52] plyr_1.8.8             zlibbioc_1.44.0        purrr_1.0.1           
   [55] RCurl_1.98-1.12        GetoptLong_1.0.5       viridis_0.6.3         
   [58] S4Vectors_0.36.2       cluster_2.1.4          magrittr_2.0.3        
   [61] genetics_1.3.8.1.3     circlize_0.4.15        mvtnorm_1.1-3         
   [64] matrixStats_0.63.0     patchwork_1.1.2        mime_0.12             
   [67] evaluate_0.20          IRanges_2.32.0         gridExtra_2.3         
   [70] compiler_4.2.2         tibble_3.2.1           mice_3.15.0           
   [73] KernSmooth_2.23-20     V8_4.3.0               crayon_1.5.2          
   [76] gdistance_1.6.2        R.oo_1.25.0            htmltools_0.5.5       
   [79] mgcv_1.8-42            later_1.3.0            tidyr_1.3.0           
   [82] DBI_1.1.3              PopGenReport_3.0.7     boot_1.3-28.1         
   [85] Matrix_1.5-4           cli_3.6.1              R.methodsS3_1.8.2     
   [88] gdata_2.18.0.1         parallel_4.2.2         dotCall64_1.0-2       
   [91] igraph_1.4.2           GenomicRanges_1.50.2   pkgconfig_2.0.3       
   [94] foreign_0.8-84         terra_1.7-29           foreach_1.5.2         
   [97] bslib_0.4.2            XVector_0.38.0         digest_0.6.31         
  [100] polysat_1.7-7          Biostrings_2.66.0      operator.tools_1.6.3  
  [103] curl_5.0.0             gap_1.5-1              shiny_1.7.4           
  [106] gtools_3.9.4           rjson_0.2.21           lifecycle_1.0.3       
  [109] nlme_3.1-162           dismo_1.3-9            jsonlite_1.8.4        
  [112] seqinr_4.2-30          viridisLite_0.4.2      fansi_1.0.4           
  [115] pillar_1.9.0           GGally_2.1.2           fastmap_1.1.1         
  [118] glue_1.6.2             mmod_1.3.3             png_0.1-8             
  [121] iterators_1.0.14       stringi_1.7.12         sass_0.4.5            
  [124] caTools_1.18.2



Quality control and SNP isolation

## load meta-data file

samps <- fread("samps_10Nov2020.csv")

## open GDS for common SNPs (PoolSNP)

genofile <- seqOpen("dest.all.PoolSNP.001.50.10Nov2020.ann.gds", allow.duplicate = TRUE)

## common SNP.dt

snp.dt <- data.table(chr = seqGetData(genofile, "chromosome"),
                     pos = seqGetData(genofile, "position"),
                     nAlleles = seqGetData(genofile, "$num_allele"),
                     id = seqGetData(genofile, "variant.id"),
                     genotype = seqGetData(genofile, "allele"))

snp.dt <- snp.dt[nAlleles == 2]
seqSetFilter(genofile, snp.dt$id)
## # of selected variants: 4,281,164
## filter to target

snp.tmp <- data.table(chr ="2L", pos = 3622074:3656953)
setkey(snp.tmp, chr, pos)
setkey(snp.dt, chr, pos)
seqSetFilter(genofile, variant.id=snp.dt[J(snp.tmp), nomatch = 0]$id)
## # of selected variants: 1,613
## get annotations

message("Annotations")
tmp <- seqGetData(genofile, "annotation/info/ANN")
len1 <- tmp$length
len2 <- tmp$data

snp.dt1 <- data.table(len = rep(len1, times = len1), ann = len2, id = rep(snp.dt[J(snp.tmp), nomatch = 0]$id, times = len1))

## Extract data between the 2nd and third | symbol

snp.dt1[ ,class := tstrsplit(snp.dt1$ann,"\\|")[[2]]]
snp.dt1[ ,gene := tstrsplit(snp.dt1$ann,"\\|")[[4]]]

## Collapse additional annotations to original SNP vector length

snp.dt1.an <- snp.dt1[,list(n = length(class), col = paste(class, collapse = ","), gene = paste(gene, collapse = ",")), list(variant.id = id)]

snp.dt1.an[,col := tstrsplit(snp.dt1.an$col,"\\,")[[1]]]
snp.dt1.an[,gene := tstrsplit(snp.dt1.an$gene,"\\,")[[1]]]

## get frequencies

message("Allele Freqs")

ad <- seqGetData(genofile, "annotation/format/AD")
dp <- seqGetData(genofile, "annotation/format/DP")

af <- data.table(ad = expand.grid(ad$data)[,1],
                 dp = expand.grid(dp)[,1],
                 sampleId = rep(seqGetData(genofile, "sample.id"), dim(ad$data)[2]),
                 variant.id = rep(seqGetData(genofile, "variant.id"), each = dim(ad$data)[1]))

## merge them together

message("merge")
afi <- merge(af, snp.dt1.an, by = "variant.id")
afi <- merge(afi, snp.dt, by.x = "variant.id", by.y = "id")

afi[ , af := ad/dp]

## calculate effective read-depth

afis <- merge(afi, samps, by = "sampleId")

afis[chr == "X", nEff := round((dp*nFlies - 1)/(dp+nFlies))]
afis[chr != "X", nEff := round((dp*2*nFlies - 1)/(dp+2*nFlies))]
afis[ ,af_nEff := round(af*nEff)/nEff]


## subsetting dataset

names(afis)
##   [1] "sampleId"       "variant.id"     "ad"             "dp"            
##   [5] "n"              "col"            "gene"           "chr"           
##   [9] "pos"            "nAlleles"       "genotype"       "af"            
##  [13] "locality"       "country"        "city"           "collectionDate"
##  [17] "DateExact"      "lat"            "long"           "season"        
##  [21] "type"           "continent"      "set"            "nFlies"        
##  [25] "SRA_accession"  "SRA_experiment" "Model"          "collector"     
##  [29] "sampleType"     "year"           "yday"           "stationId"     
##  [33] "dist_km"        "In(2L)t"        "In(2R)Ns"       "In(3L)P"       
##  [37] "In(3R)C"        "In(3R)K"        "In(3R)Mo"       "In(3R)Payne"   
##  [41] "status"         "bio1"           "bio2"           "bio3"          
##  [45] "bio4"           "bio5"           "bio6"           "bio7"          
##  [49] "bio8"           "bio9"           "bio10"          "bio11"         
##  [53] "bio12"          "bio13"          "bio14"          "bio15"         
##  [57] "bio16"          "bio17"          "bio18"          "bio19"         
##  [61] "tmin1"          "tmin2"          "tmin3"          "tmin4"         
##  [65] "tmin5"          "tmin6"          "tmin7"          "tmin8"         
##  [69] "tmin9"          "tmin10"         "tmin11"         "tmin12"        
##  [73] "tmax1"          "tmax2"          "tmax3"          "tmax4"         
##  [77] "tmax5"          "tmax6"          "tmax7"          "tmax8"         
##  [81] "tmax9"          "tmax10"         "tmax11"         "tmax12"        
##  [85] "prec1"          "prec2"          "prec3"          "prec4"         
##  [89] "prec5"          "prec6"          "prec7"          "prec8"         
##  [93] "prec9"          "prec10"         "prec11"         "prec12"        
##  [97] "propSimNorm"    "sex"            "nEff"           "af_nEff"
season.dat <- as.data.frame(afis[ , c(21, 30, 1, 13, 14, 15, 19, 18, 20, 2, 11, 12, 6, 22, 99)])
str(season.dat)
## 'data.frame':    438736 obs. of  15 variables:
##  $ type      : chr  "pooled" "pooled" "pooled" "pooled" ...
##  $ year      : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
##  $ sampleId  : chr  "AT_Mau_14_01" "AT_Mau_14_01" "AT_Mau_14_01" "AT_Mau_14_01" ...
##  $ locality  : chr  "AT_Mau" "AT_Mau" "AT_Mau" "AT_Mau" ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ city      : chr  "Mauternbach" "Mauternbach" "Mauternbach" "Mauternbach" ...
##  $ long      : num  15.6 15.6 15.6 15.6 15.6 ...
##  $ lat       : num  48.4 48.4 48.4 48.4 48.4 ...
##  $ season    : chr  "spring" "spring" "spring" "spring" ...
##  $ variant.id: int  162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
##  $ genotype  : chr  "A,C" "A,C" "C,T" "T,C" ...
##  $ af        : num  1 0 0.1569 0.0196 NA ...
##  $ col       : chr  "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ nEff      : num  28 26 31 31 NA 36 36 33 24 30 ...
head(season.dat)
##     type year     sampleId locality country        city  long    lat season
## 1 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 2 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 3 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 4 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 5 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 6 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
##   variant.id genotype         af                 col continent nEff
## 1     162333      A,C 1.00000000 3_prime_UTR_variant    Europe   28
## 2     162334      A,C 0.00000000 3_prime_UTR_variant    Europe   26
## 3     162335      C,T 0.15686275 3_prime_UTR_variant    Europe   31
## 4     162336      T,C 0.01960784 3_prime_UTR_variant    Europe   31
## 5     162337      C,T         NA 3_prime_UTR_variant    Europe   NA
## 6     162338      C,T 0.00000000 3_prime_UTR_variant    Europe   36
dim(season.dat)
## [1] 438736     15
season.dat$season[season.dat$season == "frost"] <- "fall"

season.dat$season <- as.factor(season.dat$season)
season.filter.dat <- data.frame()

localities <- levels(as.factor(season.dat$locality))
seasons <- c("fall", "spring")

## getting samples collected at least once during spring and winter

for(loc in localities){
    dft <- season.dat[season.dat$locality == loc, ]
    if(all(seasons %in% unique(dft$season))){
        season.filter.dat <- rbind(season.filter.dat, dft)
    }
}

dim(season.filter.dat)
## [1] 285501     15
dim(na.omit(season.filter.dat))
## [1] 269454     15
str(season.filter.dat)
## 'data.frame':    285501 obs. of  15 variables:
##  $ type      : chr  "pooled" "pooled" "pooled" "pooled" ...
##  $ year      : int  2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
##  $ sampleId  : chr  "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
##  $ locality  : chr  "AT_gr" "AT_gr" "AT_gr" "AT_gr" ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ city      : chr  "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
##  $ long      : num  16.4 16.4 16.4 16.4 16.4 ...
##  $ lat       : num  48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 ...
##  $ season    : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
##  $ variant.id: int  162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
##  $ genotype  : chr  "A,C" "A,C" "C,T" "T,C" ...
##  $ af        : num  0.9773 NA 0.2273 0 0.0244 ...
##  $ col       : chr  "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ nEff      : num  29 NA 29 33 28 28 27 25 27 25 ...
mat1 <- season.filter.dat[ , c(3, 6, 5, 9, 10, 14, 12, 15)]
names(mat1)
## [1] "sampleId"   "city"       "country"    "season"     "variant.id"
## [6] "continent"  "af"         "nEff"
str(mat1)
## 'data.frame':    285501 obs. of  8 variables:
##  $ sampleId  : chr  "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
##  $ city      : chr  "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ season    : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
##  $ variant.id: int  162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ af        : num  0.9773 NA 0.2273 0 0.0244 ...
##  $ nEff      : num  29 NA 29 33 28 28 27 25 27 25 ...
dim(mat1)
## [1] 285501      8
mat1 <- mat1[!is.na(mat1$nEff), ]
dim(mat1)
## [1] 269454      8
mat1 <- mat1[mat1$nEff >= 28, ] ## Applying a Neff filter of 28
dim(mat1)
## [1] 212017      8
mat1 <- mat1[ , -8]
str(mat1)
## 'data.frame':    212017 obs. of  7 variables:
##  $ sampleId  : chr  "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
##  $ city      : chr  "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ season    : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
##  $ variant.id: int  162333 162335 162336 162337 162338 162350 162351 162352 162353 162357 ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ af        : num  0.9773 0.2273 0 0.0244 0 ...
## Changing name of cities

mat1$city <- as.factor(mat1$city)
levels(mat1$city)
##  [1] "Akaa"                        "Athens"                     
##  [3] "Barcelona"                   "Benton Harbor"              
##  [5] "Broggingen"                  "Brzezina"                   
##  [7] "Chalet \xe0 Gobet"           "Charlottesville"            
##  [9] "Charlotttesville"            "Chornobyl (Cooling pond)"   
## [11] "Chornobyl Applegarden"       "Chornobyl Applegarden (New)"
## [13] "Chornobyl Kopachi"           "Chornobyl Polisske"         
## [15] "Chornobyl Yaniv"             "Cross Plains"               
## [17] "Drogobych"                   "Esparto"                    
## [19] "Gimenells"                   "Gotheron"                   
## [21] "Gross-Enzersdorf"            "Homestead"                  
## [23] "Ithaca"                      "Kalna"                      
## [25] "Karensminde"                 "Karensminde Orchard"        
## [27] "Kharkiv"                     "Kyiv"                       
## [29] "Kyiv (Vyshneve)"             "Lancaster"                  
## [31] "Linvilla"                    "Market Harborough"          
## [33] "Mauternbach"                 "Munich"                     
## [35] "Odesa"                       "Odessa"                     
## [37] "Sheffield"                   "Slankamen. Vinogradi"       
## [39] "State College"               "Sudbury"                    
## [41] "Topeka"                      "Tuolumne"                   
## [43] "valday"                      "Valday"                     
## [45] "Viltain"                     "Yesiloz"                    
## [47] "Yesiloz TR13"                "Yesiloz TR14"               
## [49] "Yesiloz TR15"                "Yesiloz TR16"               
## [51] "Yesiloz TR17"                "Yesiloz TR18"
levels(mat1$city) <- gsub("valday", "Valday", levels(mat1$city))
levels(mat1$city) <- gsub("Odesa", "Odessa", levels(mat1$city))
levels(mat1$city) <- gsub("Charlotttesville", "Charlottesville", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR13", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR14", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR15", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR16", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR17", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR18", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Karensminde Orchard", "Karensminde", levels(mat1$city))
levels(mat1$city) <- gsub("Slankamen. Vinogradi", "Slankamen-Vinogradi", levels(mat1$city))
levels(mat1$city) <- gsub("Chalet \xe0 Gobet", "Chalet-A-Gobet", levels(mat1$city))

levels(mat1$city)
##  [1] "Akaa"                        "Athens"                     
##  [3] "Barcelona"                   "Benton Harbor"              
##  [5] "Broggingen"                  "Brzezina"                   
##  [7] "Chalet-A-Gobet"              "Charlottesville"            
##  [9] "Chornobyl (Cooling pond)"    "Chornobyl Applegarden"      
## [11] "Chornobyl Applegarden (New)" "Chornobyl Kopachi"          
## [13] "Chornobyl Polisske"          "Chornobyl Yaniv"            
## [15] "Cross Plains"                "Drogobych"                  
## [17] "Esparto"                     "Gimenells"                  
## [19] "Gotheron"                    "Gross-Enzersdorf"           
## [21] "Homestead"                   "Ithaca"                     
## [23] "Kalna"                       "Karensminde"                
## [25] "Kharkiv"                     "Kyiv"                       
## [27] "Kyiv (Vyshneve)"             "Lancaster"                  
## [29] "Linvilla"                    "Market Harborough"          
## [31] "Mauternbach"                 "Munich"                     
## [33] "Odessa"                      "Sheffield"                  
## [35] "Slankamen-Vinogradi"         "State College"              
## [37] "Sudbury"                     "Topeka"                     
## [39] "Tuolumne"                    "Valday"                     
## [41] "Viltain"                     "Yesiloz"
levels(mat1$city)[c(9, 10, 11, 12, 13, 14)] <- "Chornobyl"
levels(mat1$city)
##  [1] "Akaa"                "Athens"              "Barcelona"          
##  [4] "Benton Harbor"       "Broggingen"          "Brzezina"           
##  [7] "Chalet-A-Gobet"      "Charlottesville"     "Chornobyl"          
## [10] "Cross Plains"        "Drogobych"           "Esparto"            
## [13] "Gimenells"           "Gotheron"            "Gross-Enzersdorf"   
## [16] "Homestead"           "Ithaca"              "Kalna"              
## [19] "Karensminde"         "Kharkiv"             "Kyiv"               
## [22] "Kyiv (Vyshneve)"     "Lancaster"           "Linvilla"           
## [25] "Market Harborough"   "Mauternbach"         "Munich"             
## [28] "Odessa"              "Sheffield"           "Slankamen-Vinogradi"
## [31] "State College"       "Sudbury"             "Topeka"             
## [34] "Tuolumne"            "Valday"              "Viltain"            
## [37] "Yesiloz"
levels(mat1$city)[22] <- "Kyiv"
levels(mat1$city)
##  [1] "Akaa"                "Athens"              "Barcelona"          
##  [4] "Benton Harbor"       "Broggingen"          "Brzezina"           
##  [7] "Chalet-A-Gobet"      "Charlottesville"     "Chornobyl"          
## [10] "Cross Plains"        "Drogobych"           "Esparto"            
## [13] "Gimenells"           "Gotheron"            "Gross-Enzersdorf"   
## [16] "Homestead"           "Ithaca"              "Kalna"              
## [19] "Karensminde"         "Kharkiv"             "Kyiv"               
## [22] "Lancaster"           "Linvilla"            "Market Harborough"  
## [25] "Mauternbach"         "Munich"              "Odessa"             
## [28] "Sheffield"           "Slankamen-Vinogradi" "State College"      
## [31] "Sudbury"             "Topeka"              "Tuolumne"           
## [34] "Valday"              "Viltain"             "Yesiloz"
levels(mat1$city)[29] <- "Slankamenacki-Vinogradi"
levels(mat1$city)
##  [1] "Akaa"                    "Athens"                 
##  [3] "Barcelona"               "Benton Harbor"          
##  [5] "Broggingen"              "Brzezina"               
##  [7] "Chalet-A-Gobet"          "Charlottesville"        
##  [9] "Chornobyl"               "Cross Plains"           
## [11] "Drogobych"               "Esparto"                
## [13] "Gimenells"               "Gotheron"               
## [15] "Gross-Enzersdorf"        "Homestead"              
## [17] "Ithaca"                  "Kalna"                  
## [19] "Karensminde"             "Kharkiv"                
## [21] "Kyiv"                    "Lancaster"              
## [23] "Linvilla"                "Market Harborough"      
## [25] "Mauternbach"             "Munich"                 
## [27] "Odessa"                  "Sheffield"              
## [29] "Slankamenacki-Vinogradi" "State College"          
## [31] "Sudbury"                 "Topeka"                 
## [33] "Tuolumne"                "Valday"                 
## [35] "Viltain"                 "Yesiloz"
dim(mat1)
## [1] 212017      7
str(mat1)
## 'data.frame':    212017 obs. of  7 variables:
##  $ sampleId  : chr  "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
##  $ city      : Factor w/ 36 levels "Akaa","Athens",..: 15 15 15 15 15 15 15 15 15 15 ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ season    : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
##  $ variant.id: int  162333 162335 162336 162337 162338 162350 162351 162352 162353 162357 ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ af        : num  0.9773 0.2273 0 0.0244 0 ...
mat1 <- mat1[!mat1$country == "United Kingdom", ] ## The united kingdom only has 4 pooled samples, so I removed it from the analyses

mat2 <- reshape(mat1, idvar = c("sampleId", "city", "country", "season", "continent"), timevar = "variant.id", direction = "wide")
mat2[1:6, 1:10]
##              sampleId             city country season continent af.162333
## 9679    AT_gr_12_fall Gross-Enzersdorf Austria   fall    Europe 0.9772727
## 11292 AT_gr_12_spring Gross-Enzersdorf Austria spring    Europe 0.8888889
## 1        AT_Mau_14_01      Mauternbach Austria spring    Europe 1.0000000
## 1617     AT_Mau_14_02      Mauternbach Austria   fall    Europe        NA
## 3227     AT_Mau_15_50      Mauternbach Austria spring    Europe 0.7714286
## 4840     AT_Mau_15_51      Mauternbach Austria   fall    Europe 0.6984127
##       af.162335  af.162336  af.162337 af.162338
## 9679  0.2272727 0.00000000 0.02439024         0
## 11292        NA 0.00000000         NA        NA
## 1     0.1568627 0.01960784         NA         0
## 1617         NA 0.00000000 0.00000000         0
## 3227  0.1323529 0.00000000 0.00000000         0
## 4840  0.2656250 0.01282051 0.00000000         0
dim(mat2)
## [1]  170 1614
mat2$city <- as.character(mat2$city)
mat2$season <- as.character(mat2$season)

mat2 <- mat2[mat2$city != "Homestead", ]
mat2 <- mat2[mat2$city != "Athens", ]
mat2 <- mat2[mat2$city != "Sudbury", ]
mat2 <- mat2[mat2$city != "Brzezina", ]
mat2 <- mat2[mat2$city != "Kalna", ]
mat2 <- mat2[mat2$city != "Slankamenacki-Vinogradi", ]
mat2 <- mat2[mat2$city != "Topeka", ]
mat2 <- mat2[mat2$city != "Chalet-A-Gobet", ]

usa <- c("Lancaster", "Ithaca", "Cross Plains", "Benton Harbor", "Linvilla", "State College", "Tuolumne", "Esparto", "Charlottesville")

for(i in usa){
    mat2$country[mat2$city == i] <- i
}

mat2$country <- as.factor(mat2$country)

levels(mat2$country)
##  [1] "Austria"         "Benton Harbor"   "Charlottesville" "Cross Plains"   
##  [5] "Denmark"         "Esparto"         "Finland"         "France"         
##  [9] "Germany"         "Ithaca"          "Lancaster"       "Linvilla"       
## [13] "Russia"          "Spain"           "State College"   "Tuolumne"       
## [17] "Turkey"          "Ukraine"
levels(mat2$country)[c(2, 4)] <- "MI-WI"
levels(mat2$country)
##  [1] "Austria"         "MI-WI"           "Charlottesville" "Denmark"        
##  [5] "Esparto"         "Finland"         "France"          "Germany"        
##  [9] "Ithaca"          "Lancaster"       "Linvilla"        "Russia"         
## [13] "Spain"           "State College"   "Tuolumne"        "Turkey"         
## [17] "Ukraine"
levels(mat2$country)[c(9, 10)] <- "NY-MA"
levels(mat2$country)
##  [1] "Austria"         "MI-WI"           "Charlottesville" "Denmark"        
##  [5] "Esparto"         "Finland"         "France"          "Germany"        
##  [9] "NY-MA"           "Linvilla"        "Russia"          "Spain"          
## [13] "State College"   "Tuolumne"        "Turkey"          "Ukraine"
levels(mat2$country)[c(10, 13)] <- "PA"
levels(mat2$country)
##  [1] "Austria"         "MI-WI"           "Charlottesville" "Denmark"        
##  [5] "Esparto"         "Finland"         "France"          "Germany"        
##  [9] "NY-MA"           "PA"              "Russia"          "Spain"          
## [13] "Tuolumne"        "Turkey"          "Ukraine"
mat3 <- as.data.frame(getGenotypeAlleles(genofile)) ## extracting genotypes from the GDS file
mat3[1:5, 1:5]
##              162333 162334 162335 162336 162337
## AT_Mau_14_01    C/C    A/A    C/T    T/C   <NA>
## AT_Mau_14_02    A/C    A/A    C/T    T/T    C/C
## AT_Mau_15_50    A/C    A/A    C/T    T/T    C/C
## AT_Mau_15_51    A/C   <NA>    C/T    T/C    C/C
## AT_See_14_44    A/C    A/C    C/T    T/T    C/C
for(i in 1:ncol(mat3)){
    mat3[ , i] <- gsub("/", "", mat3[, i])
}

mat3[1:5, 1:5]
##              162333 162334 162335 162336 162337
## AT_Mau_14_01     CC     AA     CT     TC   <NA>
## AT_Mau_14_02     AC     AA     CT     TT     CC
## AT_Mau_15_50     AC     AA     CT     TT     CC
## AT_Mau_15_51     AC   <NA>     CT     TC     CC
## AT_See_14_44     AC     AC     CT     TT     CC
mat3$ind <- rownames(mat3)
dim(mat3)
## [1]  272 1614
mat3[1:5, 1610:1614]
##              163994 163995 163996 163997          ind
## AT_Mau_14_01     GA     GG     GC     AA AT_Mau_14_01
## AT_Mau_14_02     GA     GG     GC     AT AT_Mau_14_02
## AT_Mau_15_50     GA     GG     GC     AA AT_Mau_15_50
## AT_Mau_15_51     GA     GG     GC     AA AT_Mau_15_51
## AT_See_14_44     GA     GA     GC     AA AT_See_14_44
mat4 <- mat3[mat3$ind %in% mat2$sampleId, ]
dim(mat4)
## [1]  152 1614
mat4[1:5, 1600:1614]
##               163983 163984 163985 163986 163987 163988 163989 163991 163992
## AT_Mau_14_01      AA     AG     TT     CG     AG     CC     AC     CC     AG
## AT_Mau_14_02      AA     AG     TT     CG     AG     CC     AC     CC     AA
## AT_Mau_15_50      AA     AG     TC     CG     AG     CT     AC     CG     AG
## AT_Mau_15_51      AA     AG     TT     CG     AG     CC     AC     CG     AA
## AT_gr_12_fall     AA     AG     TT     CG     AG     CT     AC     CG     AA
##               163993 163994 163995 163996 163997           ind
## AT_Mau_14_01      TC     GA     GG     GC     AA  AT_Mau_14_01
## AT_Mau_14_02      TC     GA     GG     GC     AT  AT_Mau_14_02
## AT_Mau_15_50      TC     GA     GG     GC     AA  AT_Mau_15_50
## AT_Mau_15_51      TC     GA     GG     GC     AA  AT_Mau_15_51
## AT_gr_12_fall     TC     GA     GG     GC     AA AT_gr_12_fall
same_ord <- mat2[ , c(1, 2, 3, 4)]
colnames(same_ord) <- c("ind", "city", "country", "season")
mat5 <- merge(mat4, same_ord, by = "ind")
dim(mat5)
## [1]  152 1617
mat5[1:5, 1:5]
##               ind 162333 162334 162335 162336
## 1   AT_gr_12_fall     AC   <NA>     CT     TT
## 2 AT_gr_12_spring     AC     AA     CT     TT
## 3    AT_Mau_14_01     CC     AA     CT     TC
## 4    AT_Mau_14_02     AC     AA     CT     TT
## 5    AT_Mau_15_50     AC     AA     CT     TT
rownames(mat5) <- mat5[ , 1]
dim(mat5)
## [1]  152 1617
mat5[1:5, 1610:1617]
##                 163993 163994 163995 163996 163997             city country
## AT_gr_12_fall       TC     GA     GG     GC     AA Gross-Enzersdorf Austria
## AT_gr_12_spring     TC     GA     GG     GC     AA Gross-Enzersdorf Austria
## AT_Mau_14_01        TC     GA     GG     GC     AA      Mauternbach Austria
## AT_Mau_14_02        TC     GA     GG     GC     AT      Mauternbach Austria
## AT_Mau_15_50        TC     GA     GG     GC     AA      Mauternbach Austria
##                 season
## AT_gr_12_fall     fall
## AT_gr_12_spring spring
## AT_Mau_14_01    spring
## AT_Mau_14_02      fall
## AT_Mau_15_50    spring
mat6 <- mat5[ , -c(1, 1615, 1616, 1617)]
dim(mat6)
## [1]  152 1613
mat6[1:5, 1:5]
##                 162333 162334 162335 162336 162337
## AT_gr_12_fall       AC   <NA>     CT     TT     CT
## AT_gr_12_spring     AC     AA     CT     TT     CC
## AT_Mau_14_01        CC     AA     CT     TC   <NA>
## AT_Mau_14_02        AC     AA     CT     TT     CC
## AT_Mau_15_50        AC     AA     CT     TT     CC
mat6[1:5, 1605:1613]
##                 163988 163989 163991 163992 163993 163994 163995 163996 163997
## AT_gr_12_fall       CT     AC     CG     AA     TC     GA     GG     GC     AA
## AT_gr_12_spring     CC     AA     CG     AA     TC     GA     GG     GC     AA
## AT_Mau_14_01        CC     AC     CC     AG     TC     GA     GG     GC     AA
## AT_Mau_14_02        CC     AC     CC     AA     TC     GA     GG     GC     AT
## AT_Mau_15_50        CT     AC     CG     AG     TC     GA     GG     GC     AA
fly_gen <- df2genind(mat6, ploidy = 2, ind.names = rownames(mat6), pop = mat5$country, sep = "")
fly_gen
## /// GENIND OBJECT /////////
## 
##  // 152 individuals; 1,613 loci; 3,225 alleles; size: 2.7 Mb
## 
##  // Basic content
##    @tab:  152 x 3225 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 1-2)
##    @loc.fac: locus factor for the 3225 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: df2genind(X = mat6, sep = "", ind.names = rownames(mat6), pop = mat5$country, 
##     ploidy = 2)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
fly_gen$tab[1:5, 1:5]
##                 162333.A 162333.C 162334.A 162334.C 162335.C
## AT_gr_12_fall          1        1       NA       NA        1
## AT_gr_12_spring        1        1        2        0        1
## AT_Mau_14_01           0        2        2        0        1
## AT_Mau_14_02           1        1        2        0        1
## AT_Mau_15_50           1        1        2        0        1
summary(fly_gen$pop)
##         Austria           MI-WI Charlottesville         Denmark         Esparto 
##               6               9               6               5               5 
##         Finland          France         Germany           NY-MA              PA 
##               6              10              12               6              18 
##          Russia           Spain        Tuolumne          Turkey         Ukraine 
##               6               7               5              13              38
indmiss_fly <- propTyped(fly_gen, by = "ind")
indmiss_fly[which(indmiss_fly < 0.80)]
## PA_li_10_spring 
##       0.7123373
barplot(indmiss_fly, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Genotypes", side = 1, line = 3.7)

fly_gen <- missingno(fly_gen, type = "geno", cutoff = 0.20) ## This is the genind file that I will use for all the analyses
fly_gen
## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,613 loci; 3,225 alleles; size: 2.7 Mb
## 
##  // Basic content
##    @tab:  151 x 3225 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 1-2)
##    @loc.fac: locus factor for the 3225 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
## quality control begins here

mlg(fly_gen) ## keep only polymorphic loci
## #############################
## # Number of Individuals:  151 
## # Number of MLG:  151 
## #############################
## [1] 151
isPoly(fly_gen) %>% summary
##    Mode   FALSE    TRUE 
## logical       1    1612
poly_loci <- names(which(isPoly(fly_gen) == TRUE))
fly_gen2 <- fly_gen[loc = poly_loci]
isPoly(fly_gen2) %>% summary
##    Mode    TRUE 
## logical    1612
fly_gen2$loc.n.all <- fly_gen2$loc.n.all[which(fly_gen2$loc.n.all == 2)]

n <- names(which(nAll(fly_gen2) == 2))
fly_gen2 <- fly_gen2[loc = n]
fly_gen2
## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,612 loci; 3,224 alleles; size: 2.7 Mb
## 
##  // Basic content
##    @tab:  151 x 3224 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3224 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
indmiss_fly2 <- propTyped(fly_gen2, by = "ind")

barplot(indmiss_fly2, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Genotypes", side = 1, line = 3.7)

locmiss_fly <- propTyped(fly_gen2, by = "loc")
locmiss_fly[which(locmiss_fly < 0.80)] ## print loci with < 80% complete genotypes
##    162334    162396    162412    162545    162546    162547    162549    162551 
## 0.6225166 0.6092715 0.7880795 0.7947020 0.7682119 0.7947020 0.7748344 0.7880795 
##    162553    162554    162555    162557    162560    162563    162587    162623 
## 0.7880795 0.7814570 0.7880795 0.7947020 0.7814570 0.7947020 0.7748344 0.5298013 
##    162746    162755    162756    162882    162892    162986    162987    162988 
## 0.4238411 0.5562914 0.5562914 0.7748344 0.4238411 0.7152318 0.7152318 0.7284768 
##    162989    162990    162992    163029    163072    163073    163074    163075 
## 0.7284768 0.7284768 0.7549669 0.6291391 0.4834437 0.4370861 0.4834437 0.4834437 
##    163076    163118    163119    163120    163121    163122    163123    163124 
## 0.4834437 0.6688742 0.6688742 0.6688742 0.6688742 0.6688742 0.6158940 0.6158940 
##    163125    163307    163363    163365    163366    163608    163648    163650 
## 0.4370861 0.4768212 0.7947020 0.7947020 0.7483444 0.7350993 0.5165563 0.5827815 
##    163692    163715    163716    163717    163765    163821    163908    163910 
## 0.4569536 0.7086093 0.7086093 0.7086093 0.6357616 0.6092715 0.5231788 0.5629139 
##    163914    163915    163916    163917    163922    163958 
## 0.6158940 0.5894040 0.5827815 0.5761589 0.6953642 0.7019868
barplot(locmiss_fly, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Locus", side = 1, line = 3)

fly_gen3 <- missingno(fly_gen2, type = "loci", cutoff = 0.20)
fly_gen3
## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
## 
##  // Basic content
##    @tab:  151 x 3100 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3100 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
locmiss_fly3 <- propTyped(fly_gen3, by = "loc")
locmiss_fly3[which(locmiss_fly3 < 0.80)]
## named numeric(0)
fly_gen3$tab[1:5, 1:5]
##                 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall          1        1        1        1        2
## AT_gr_12_spring        1        1        1        1        2
## AT_Mau_14_01           0        2        1        1        1
## AT_Mau_14_02           1        1        1        1        2
## AT_Mau_15_50           1        1        1        1        2
min(ploidy(fly_gen3))
## [1] 2
max(ploidy(fly_gen3))
## [1] 2
summary(fly_gen3$pop)
##         Austria           MI-WI Charlottesville         Denmark         Esparto 
##               6               9               6               5               5 
##         Finland          France         Germany           NY-MA              PA 
##               6              10              12               6              17 
##          Russia           Spain        Tuolumne          Turkey         Ukraine 
##               6               7               5              13              38
mlg(fly_gen3)
## #############################
## # Number of Individuals:  151 
## # Number of MLG:  151 
## #############################
## [1] 151
isPoly(fly_gen3) %>% summary
##    Mode    TRUE 
## logical    1550
poly_loci3 <- names(which(isPoly(fly_gen3) == TRUE))
fly_gen3 <- fly_gen3[loc = poly_loci3]
isPoly(fly_gen3) %>% summary
##    Mode    TRUE 
## logical    1550
barplot(locmiss_fly3, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Locus", side = 1, line = 3)

## computing basic stats

basic_fly <- basic.stats(fly_gen3, diploid = TRUE)

## testing for H-W equilibrium

## Chi-squared test: p-value

HWE.test <- data.frame(sapply(seppop(fly_gen3), 
                              function(ls) pegas::hw.test(ls, B = 0)[ , 3]))


HWE.test.chisq <- t(data.matrix(HWE.test))
HWE.test.chisq[1:5, 1:5]
##                     162333     162335    162336    162337    162338
## Austria         0.08018122 0.01430588 0.6242061 0.8037847 1.0000000
## MI.WI           0.08968602 0.04722090 1.0000000 1.0000000 0.5485062
## Charlottesville 0.23013934 0.02534732 1.0000000 0.8237839 0.8037847
## Denmark         0.33790402 0.02534732 1.0000000 0.5761501 1.0000000
## Esparto         0.23013934 0.04550026 1.0000000 0.7750970 0.8037847
## Monte Carlo: p-value

HWE.test <- data.frame(sapply(seppop(fly_gen3), 
                              function(ls) pegas::hw.test(ls, B = 1000)[ , 4]))

HWE.test.MC <- t(data.matrix(HWE.test))
HWE.test.MC[1:5, 1:5]
##                 162333 162335 162336 162337 162338
## Austria          0.398  0.092      1      1      1
## MI.WI            0.425  0.169      1      1      1
## Charlottesville  1.000  0.132      1      1      1
## Denmark          1.000  0.131      1      1      1
## Esparto          1.000  0.330      1      1      1
alpha <- 0.05

Prop.loci.out.of.HWE <- data.frame(Chisq = apply(HWE.test.chisq < alpha, 2, mean), MC = apply(HWE.test.MC < alpha, 2, mean))

Prop.loci.out.of.HWE[1:10, ]
##            Chisq        MC
## 162333 0.3333333 0.2666667
## 162335 1.0000000 0.4000000
## 162336 0.0000000 0.0000000
## 162337 0.0000000 0.0000000
## 162338 0.0000000 0.0000000
## 162339 0.0000000 0.0000000
## 162340 0.0000000 0.0000000
## 162341 1.0000000 0.4666667
## 162342 0.0000000 0.0000000
## 162343 0.0000000 0.0000000
## "False discovery rate" correction for the number of tests. Here I use the function ‘p.adjust’ with the argument method = "fdr" to adjust the p-values from the previous tests

Chisq.fdr <- matrix(p.adjust(HWE.test.chisq, method = "fdr"), 
                    nrow = nrow(HWE.test.chisq))

MC.fdr <- matrix(p.adjust(HWE.test.MC, method = "fdr"), 
                    nrow = nrow(HWE.test.MC))

Prop.loci.out.of.HWE <- data.frame(Chisq = apply(HWE.test.chisq<alpha, 2, mean), 
           MC = apply(HWE.test.MC<alpha, 2, mean),
           Chisq.fdr = apply(Chisq.fdr<alpha, 2, mean),
           MC.fdr = apply(MC.fdr<alpha, 2, mean))

head(Prop.loci.out.of.HWE)
##            Chisq        MC Chisq.fdr     MC.fdr
## 162333 0.3333333 0.2666667 0.2666667 0.06666667
## 162335 1.0000000 0.4000000 0.6000000 0.33333333
## 162336 0.0000000 0.0000000 0.0000000 0.00000000
## 162337 0.0000000 0.0000000 0.0000000 0.00000000
## 162338 0.0000000 0.0000000 0.0000000 0.00000000
## 162339 0.0000000 0.0000000 0.0000000 0.00000000
loci <- data.frame()

idx <- 1

for(i in 1:nrow(Prop.loci.out.of.HWE)){
    fdr <- Prop.loci.out.of.HWE[i , ]
    if(fdr$MC.fdr > 0.5){
        loci <- rbind(fdr, loci)
    }

}

loci ## none of the loci are consistenly out of HWE. Some of them are out of HWE in less than 50% of the populations.
## data frame with 0 columns and 0 rows
dim(loci)
## [1] 0 0
## check for linkage disequilibrium (LD)

LD <- as.genclone(fly_gen3)
LD
## 
## This is a genclone object
## -------------------------
## Genotype information:
## 
##     151 original multilocus genotypes 
##     151 diploid individuals
##    1550 codominant loci
## 
## Population information:
## 
##       0 strata. 
##      15 populations defined - 
## Austria, MI-WI, Charlottesville, ..., Tuolumne, Turkey, Ukraine
quartz()
LD.general <- poppr::ia(LD, sample = 500)

LD.general
##           Ia         p.Ia        rbarD         p.rD 
## 15.592940300  0.001996008  0.013771123  0.001996008



Computing pairwise Fst test

fly_fst <- genet.dist(fly_gen3, method = "WC84") %>% round(digits = 3)

fst.mat <- as.matrix(fly_fst)
fst.mat[fst.mat < 0] <- 0
fst.mat[1:10, 1:10]
##                 Austria MI-WI Charlottesville Denmark Esparto Finland France
## Austria           0.000 0.053           0.058   0.018   0.056   0.015  0.021
## MI-WI             0.053 0.000           0.005   0.049   0.012   0.049  0.042
## Charlottesville   0.058 0.005           0.000   0.054   0.017   0.057  0.045
## Denmark           0.018 0.049           0.054   0.000   0.049   0.024  0.018
## Esparto           0.056 0.012           0.017   0.049   0.000   0.054  0.041
## Finland           0.015 0.049           0.057   0.024   0.054   0.000  0.022
## France            0.021 0.042           0.045   0.018   0.041   0.022  0.000
## Germany           0.012 0.041           0.046   0.016   0.043   0.013  0.010
## NY-MA             0.044 0.006           0.003   0.040   0.009   0.044  0.033
## PA                0.053 0.004           0.004   0.050   0.013   0.051  0.044
##                 Germany NY-MA    PA
## Austria           0.012 0.044 0.053
## MI-WI             0.041 0.006 0.004
## Charlottesville   0.046 0.003 0.004
## Denmark           0.016 0.040 0.050
## Esparto           0.043 0.009 0.013
## Finland           0.013 0.044 0.051
## France            0.010 0.033 0.044
## Germany           0.000 0.034 0.041
## NY-MA             0.034 0.000 0.002
## PA                0.041 0.002 0.000
tab <- gi2gl(fly_gen3, parallel = FALSE, verbose = NULL) ## converting genind object to genlight object
## Starting gi2gl 
## Starting gl.compliance.check 
##   Processing genlight object with SNP data
##   Checking coding of SNPs
##     SNP data scored NA, 0, 1 or 2 confirmed
##   Checking locus metrics and flags
##   Recalculating locus metrics
##   Checking for monomorphic loci
##     No monomorphic loci detected
##   Checking for loci with all missing data
##     No loci with all missing data detected
##   Checking whether individual names are unique.
##   Checking for individual metrics
##   Warning: Creating a slot for individual metrics
##   Checking for population assignments
##     Population assignments confirmed
##   Spelling of coordinates checked and changed if necessary to 
##             lat/lon
## Completed: gl.compliance.check 
## Completed: gi2gl
pval_fst <- gl.fst.pop(tab, nboots = 1000, percent = 95, nclusters = 1, verbose = NULL)
## Starting gl.fst.pop 
##   Processing genlight object with SNP data
## Completed: gl.fst.pop
fst <- as.matrix(as.dist(pval_fst$Fsts))
fst[fst < 0] <- 0
fst
##                    Austria     Esparto    Tuolumne     Germany    Denmark
## Austria         0.00000000 0.055703474 0.058895752 0.012410932 0.01800624
## Esparto         0.05570347 0.000000000 0.000000000 0.043366976 0.04899978
## Tuolumne        0.05889575 0.000000000 0.000000000 0.046380979 0.05326210
## Germany         0.01241093 0.043366976 0.046380979 0.000000000 0.01632085
## Denmark         0.01800624 0.048999783 0.053262097 0.016320855 0.00000000
## Spain           0.01644031 0.044356991 0.047779471 0.007609145 0.02535346
## Finland         0.01510726 0.054481851 0.060073773 0.013480562 0.02446933
## France          0.02086204 0.040896003 0.045503573 0.009973570 0.01796563
## NY-MA           0.04387058 0.008737692 0.007955468 0.033551239 0.04003517
## MI-WI           0.05336238 0.012416407 0.008962355 0.041320541 0.04880945
## PA              0.05310220 0.013471651 0.009573028 0.041313299 0.05015960
## Russia          0.01689966 0.053516381 0.060654501 0.020656074 0.02332836
## Turkey          0.01412191 0.060541318 0.062921149 0.018900400 0.02179383
## Ukraine         0.01416309 0.063774691 0.069481577 0.020327235 0.02369749
## Charlottesville 0.05819820 0.017272103 0.015222898 0.046035247 0.05402774
##                       Spain    Finland     France       NY-MA       MI-WI
## Austria         0.016440309 0.01510726 0.02086204 0.043870575 0.053362379
## Esparto         0.044356991 0.05448185 0.04089600 0.008737692 0.012416407
## Tuolumne        0.047779471 0.06007377 0.04550357 0.007955468 0.008962355
## Germany         0.007609145 0.01348056 0.00997357 0.033551239 0.041320541
## Denmark         0.025353461 0.02446933 0.01796563 0.040035167 0.048809450
## Spain           0.000000000 0.02001428 0.01567832 0.029541647 0.039313950
## Finland         0.020014275 0.00000000 0.02212788 0.043508846 0.048678641
## France          0.015678321 0.02212788 0.00000000 0.033457193 0.042244680
## NY-MA           0.029541647 0.04350885 0.03345719 0.000000000 0.005774646
## MI-WI           0.039313950 0.04867864 0.04224468 0.005774646 0.000000000
## PA              0.038958914 0.05145801 0.04367546 0.001936861 0.004351338
## Russia          0.031639501 0.01913517 0.02199934 0.047419819 0.052957069
## Turkey          0.026809159 0.01677821 0.02823027 0.049358359 0.054304445
## Ukraine         0.031831900 0.02245250 0.02881299 0.055798661 0.064183546
## Charlottesville 0.046471038 0.05728551 0.04450952 0.002951612 0.004506388
##                          PA      Russia     Turkey     Ukraine Charlottesville
## Austria         0.053102204 0.016899657 0.01412191 0.014163086     0.058198201
## Esparto         0.013471651 0.053516381 0.06054132 0.063774691     0.017272103
## Tuolumne        0.009573028 0.060654501 0.06292115 0.069481577     0.015222898
## Germany         0.041313299 0.020656074 0.01890040 0.020327235     0.046035247
## Denmark         0.050159598 0.023328359 0.02179383 0.023697489     0.054027742
## Spain           0.038958914 0.031639501 0.02680916 0.031831900     0.046471038
## Finland         0.051458012 0.019135166 0.01677821 0.022452499     0.057285510
## France          0.043675456 0.021999335 0.02823027 0.028812985     0.044509519
## NY-MA           0.001936861 0.047419819 0.04935836 0.055798661     0.002951612
## MI-WI           0.004351338 0.052957069 0.05430445 0.064183546     0.004506388
## PA              0.000000000 0.058255707 0.05610334 0.066041832     0.004019586
## Russia          0.058255707 0.000000000 0.01717863 0.007515034     0.059645911
## Turkey          0.056103340 0.017178630 0.00000000 0.012939846     0.058036081
## Ukraine         0.066041832 0.007515034 0.01293985 0.000000000     0.067300473
## Charlottesville 0.004019586 0.059645911 0.05803608 0.067300473     0.000000000



## png("heatmap.png", width = 7, height = 7, units = "in", res = 300)

## pdf("heatmap.pdf")

my_palette <- colorRampPalette(c("blue", "red"))(n = 100)

heatmap.2(fst.mat, density.info = "none", trace = "none", scale = "none", cexRow = 0.7, cexCol = 0.7, key.title = "Fst", srtCol = 45, srtRow = 35, margins = c(8.5, 5), col = my_palette, symbreaks = TRUE)

## dev.off()

Figure 1. Heatmap depicting genetic differentiation among populations of D. melanogaster based on pairwise Fst values. In some cases, samples from adjacent localities were combined to avoid low sample sizes. Abbreviations are as follows: PA = Pennsylvania; MI-WI = Michigan and Wisconsin; NY-MA = New York and Massachusetts.



Genetic admixture proportions

##seqSetFilter(genofile, variant.id = rownames(snps), sample.id = rownames(indi))
##seqGDS2VCF(genofile, "my_vcf.gz", verbose = TRUE)
##outest <- read.vcfR("my_vcf.gz")
##dataout <- vcfR2genind(outest)


## exploring the data once again (do it all the time!)

fly_gen3
## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
## 
##  // Basic content
##    @tab:  151 x 3100 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3100 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
fly_gen3$tab[1:5, 1:5]
##                 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall          1        1        1        1        2
## AT_gr_12_spring        1        1        1        1        2
## AT_Mau_14_01           0        2        1        1        1
## AT_Mau_14_02           1        1        1        1        2
## AT_Mau_15_50           1        1        1        1        2
nLoc(fly_gen3) # number of loci
## [1] 1550
nPop(fly_gen3) # number of sites
## [1] 15
nInd(fly_gen3) # number of individuals
## [1] 151
summary(fly_gen3$pop) # sample size
##         Austria           MI-WI Charlottesville         Denmark         Esparto 
##               6               9               6               5               5 
##         Finland          France         Germany           NY-MA              PA 
##               6              10              12               6              17 
##          Russia           Spain        Tuolumne          Turkey         Ukraine 
##               6               7               5              13              38
min(ploidy(fly_gen3))
## [1] 2
max(ploidy(fly_gen3))
## [1] 2
## gl2geno(tab, outfile = "gl_geno", outpath = getwd(), verbose = NULL) ## Converting genlight object to geno object

## Run snmf algorithm

#snmf1 <- snmf("gl_geno.geno",
#             K = 1:10, # number of K ancestral populations to run
#             repetitions = 50, # 50 repetitions for each K
#             entropy = TRUE, # calculate cross-entropy
#             project = "new")


snmf1 <- load.snmfProject("gl_geno.snmfProject")


## extract Q-matrix for the best run

plot(snmf1, col = "blue", cex = 1.5, pch = 19, las = 1)

## extract the cross-entropy of all runs where K = 9

ce <- cross.entropy(snmf1, K = 9)
ce
##            K = 9
## run 1  0.3026330
## run 2  0.3006478
## run 3  0.2965800
## run 4  0.2934025
## run 5  0.2985019
## run 6  0.3038042
## run 7  0.3018757
## run 8  0.2907803
## run 9  0.2968350
## run 10 0.2961763
## run 11 0.2993841
## run 12 0.2929597
## run 13 0.2910719
## run 14 0.2818298
## run 15 0.2994594
## run 16 0.2894918
## run 17 0.2949548
## run 18 0.2914579
## run 19 0.2884607
## run 20 0.3000826
## run 21 0.2908541
## run 22 0.2981863
## run 23 0.2902612
## run 24 0.2904714
## run 25 0.3029123
## run 26 0.3011128
## run 27 0.2925949
## run 28 0.2941099
## run 29 0.3023984
## run 30 0.3020126
## run 31 0.2983558
## run 32 0.2997775
## run 33 0.2902648
## run 34 0.2993205
## run 35 0.2987137
## run 36 0.2952065
## run 37 0.2994274
## run 38 0.3042761
## run 39 0.2976461
## run 40 0.2993694
## run 41 0.2967565
## run 42 0.2944768
## run 43 0.2941358
## run 44 0.2882492
## run 45 0.2983567
## run 46 0.2942568
## run 47 0.2906855
## run 48 0.2999784
## run 49 0.3022837
## run 50 0.2979496
## find the run with the lowest cross-entropy

lowest.ce <- which.min(ce)
lowest.ce
## [1] 14
## extract Q-matrix for the best run

qmatrix <- as.data.frame(Q(snmf1, K = 9, run = lowest.ce))
head(qmatrix)
##            V1          V2         V3          V4          V5       V6
## 1 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05 0.000755320 0.613557
## 2 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01 0.178439000 0.299589
## 3 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02 0.000099973 0.328833
## 4 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05 0.018455600 0.145086
## 5 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05 0.000099964 0.184407
## 6 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05 0.036707400 0.319622
##            V7          V8          V9
## 1 9.99639e-05 0.049494200 0.079542400
## 2 1.66984e-02 0.000099973 0.299490000
## 3 9.99730e-05 0.188641000 0.133300000
## 4 9.99730e-05 0.516370000 0.147398000
## 5 9.99640e-05 0.167587000 0.000099964
## 6 9.99640e-05 0.425819000 0.000099964
## changing order of levels

pops <- fly_gen3$pop
levels(pops)
##  [1] "Austria"         "MI-WI"           "Charlottesville" "Denmark"        
##  [5] "Esparto"         "Finland"         "France"          "Germany"        
##  [9] "NY-MA"           "PA"              "Russia"          "Spain"          
## [13] "Tuolumne"        "Turkey"          "Ukraine"
levels(pops)[3] <- "Charlott."


qmplot <- cbind(qmatrix, pops)
qmplot$pops <- factor(qmplot$pops, levels = c("Esparto", "Tuolumne", "MI-WI", "Charlott.", "PA", "NY-MA", "Spain", "France", "Germany", "Austria", "Denmark", "Finland", "Ukraine", "Russia", "Turkey"))
qmplot <- qmplot[order(qmplot$pops), ]

pops <- qmplot$pops
## png("admixture_revision.png", width = 7, height = 7, units = "in", res = 300)

## pdf("admixture_revision.pdf")

layout(matrix(c(0, 1, 1, 0,
                0, 1, 1, 0,
                2, 2, 2, 2,
                2, 2, 2, 2), nrow = 4, ncol = 4, byrow = TRUE))

par(mar = c(4, 4, 1, 0))
plot(snmf1, col = "blue", cex = 1.5, pch = 19, las = 1, cex.lab = 1.3)
Arrows(x = 9, y = 0.295, x1 = 9, y1 = 0.305, col = "red", arr.type = "triangle", code = 1, lwd = 1.5, arr.length = 0.2)
mtext("(a)", side = 2, at = 0.342, line = 2.8, font = 2, family = "serif", las = 1)

par(mar = c(5, 4.5, 1, 0))
barplot(t(qmplot[1:9]), col = RColorBrewer::brewer.pal(9,"Paired"), border = NA, space = 0, xlab = "", xaxt = "n",  ylab = "Admixture proportion", las = 1, cex.lab = 1.4)

## adding population labels to the axis:

names <- unique(as.character(pops))

medians <- c()

for(i in 1:length(pops)){
    
    axis(1, at = median(which(pops == pops[i])), labels = "")
    medians <- c(medians, median(which(pops == pops[i])))
}

medians[1:10]
##  [1] 3 3 3 3 3 8 8 8 8 8
text(x = as.numeric(unique(as.character(medians))), y = par("usr")[3] - 0.05, labels = names, xpd = NA, srt = 35, cex = 1, adj = 1.2)
mtext("Pools", side = 1, line = 4)
mtext("(b)", side = 2, at = 1.1, line = 2.4, font = 2, family = "serif", las = 1)

## dev.off()

Figure 2. Population structure analysis based on individual ancestry coefficients for a number of ancestral populations. (a) Cross-entropy values for each snmf run with k ranging between k = 1 and k = 10. The red arrow indicates the most likely value of k. (b) Admixture proportion across populations of D. melanogaster. Tick marks on the x axis correspond to the median number of pools per locality. Colors indicate genetic clusters.



## label column names of qmatrix

ncol(qmatrix)
## [1] 9
cluster_names = c()

for(i in 1:ncol(qmatrix)){
  cluster_names[i] = paste("Cluster", i)
}

cluster_names
## [1] "Cluster 1" "Cluster 2" "Cluster 3" "Cluster 4" "Cluster 5" "Cluster 6"
## [7] "Cluster 7" "Cluster 8" "Cluster 9"
colnames(qmatrix) <- cluster_names
head(qmatrix)
##     Cluster 1   Cluster 2  Cluster 3   Cluster 4   Cluster 5 Cluster 6
## 1 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05 0.000755320  0.613557
## 2 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01 0.178439000  0.299589
## 3 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02 0.000099973  0.328833
## 4 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05 0.018455600  0.145086
## 5 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05 0.000099964  0.184407
## 6 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05 0.036707400  0.319622
##     Cluster 7   Cluster 8   Cluster 9
## 1 9.99639e-05 0.049494200 0.079542400
## 2 1.66984e-02 0.000099973 0.299490000
## 3 9.99730e-05 0.188641000 0.133300000
## 4 9.99730e-05 0.516370000 0.147398000
## 5 9.99640e-05 0.167587000 0.000099964
## 6 9.99640e-05 0.425819000 0.000099964
dim(qmatrix)
## [1] 151   9
## add individual IDs

qmatrix$Ind <- indNames(fly_gen3)

## add site IDs

qmatrix$Site <- fly_gen3$pop
head(qmatrix)
##     Cluster 1   Cluster 2  Cluster 3   Cluster 4   Cluster 5 Cluster 6
## 1 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05 0.000755320  0.613557
## 2 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01 0.178439000  0.299589
## 3 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02 0.000099973  0.328833
## 4 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05 0.018455600  0.145086
## 5 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05 0.000099964  0.184407
## 6 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05 0.036707400  0.319622
##     Cluster 7   Cluster 8   Cluster 9             Ind    Site
## 1 9.99639e-05 0.049494200 0.079542400   AT_gr_12_fall Austria
## 2 1.66984e-02 0.000099973 0.299490000 AT_gr_12_spring Austria
## 3 9.99730e-05 0.188641000 0.133300000    AT_Mau_14_01 Austria
## 4 9.99730e-05 0.516370000 0.147398000    AT_Mau_14_02 Austria
## 5 9.99640e-05 0.167587000 0.000099964    AT_Mau_15_50 Austria
## 6 9.99640e-05 0.425819000 0.000099964    AT_Mau_15_51 Austria
## calculate mean admixture proportions for each site

clusters <- grep("Cluster", names(qmatrix)) ## indexes of cluster columns
avg_admix <- aggregate(qmatrix[, clusters], list(qmatrix$Site, qmatrix$Ind), mean)
colnames(avg_admix)[1:2] <- c("country", "Ind")
head(avg_admix)
##   country             Ind   Cluster 1   Cluster 2  Cluster 3   Cluster 4
## 1 Austria   AT_gr_12_fall 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05
## 2 Austria AT_gr_12_spring 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01
## 3 Austria    AT_Mau_14_01 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02
## 4 Austria    AT_Mau_14_02 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05
## 5 Austria    AT_Mau_15_50 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05
## 6 Austria    AT_Mau_15_51 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05
##     Cluster 5 Cluster 6   Cluster 7   Cluster 8   Cluster 9
## 1 0.000755320  0.613557 9.99639e-05 0.049494200 0.079542400
## 2 0.178439000  0.299589 1.66984e-02 0.000099973 0.299490000
## 3 0.000099973  0.328833 9.99730e-05 0.188641000 0.133300000
## 4 0.018455600  0.145086 9.99730e-05 0.516370000 0.147398000
## 5 0.000099964  0.184407 9.99640e-05 0.167587000 0.000099964
## 6 0.036707400  0.319622 9.99640e-05 0.425819000 0.000099964
str(avg_admix)
## 'data.frame':    151 obs. of  11 variables:
##  $ country  : Factor w/ 15 levels "Austria","MI-WI",..: 1 1 1 1 1 1 5 5 5 5 ...
##  $ Ind      : chr  "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
##  $ Cluster 1: num  0.0001 0.0001 0.0001 0.0001 0.0133 ...
##  $ Cluster 2: num  0.0001 0.0001 0.0779 0.164 0.0437 ...
##  $ Cluster 3: num  0.25625 0.06443 0.24535 0.00835 0.5906 ...
##  $ Cluster 4: num  0.0001 0.1411 0.0257 0.0001 0.0001 ...
##  $ Cluster 5: num  0.000755 0.178439 0.0001 0.018456 0.0001 ...
##  $ Cluster 6: num  0.614 0.3 0.329 0.145 0.184 ...
##  $ Cluster 7: num  0.0001 0.0167 0.0001 0.0001 0.0001 ...
##  $ Cluster 8: num  0.0495 0.0001 0.1886 0.5164 0.1676 ...
##  $ Cluster 9: num  0.0795 0.2995 0.1333 0.1474 0.0001 ...
## import csv file containing coordinates

coor <- read.csv("coor.csv")
str(coor)
## 'data.frame':    16 obs. of  3 variables:
##  $ country: chr  "Austria" "Esparto" "Tuolumne" "Germany" ...
##  $ long   : num  15.96 -122.05 -120.26 9.71 10.21 ...
##  $ lat    : num  48.3 38.7 38 48.2 55.9 ...
head(coor)
##    country        long      lat
## 1  Austria   15.965000 48.28750
## 2  Esparto -122.046000 38.68000
## 3 Tuolumne -120.260000 37.97000
## 4  Germany    9.714757 48.19901
## 5  Denmark   10.212586 55.94513
## 6    Spain    1.279236 41.51821
admix <- merge(coor, avg_admix, by = "country")
str(admix)
## 'data.frame':    151 obs. of  13 variables:
##  $ country  : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ long     : num  16 16 16 16 16 ...
##  $ lat      : num  48.3 48.3 48.3 48.3 48.3 ...
##  $ Ind      : chr  "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
##  $ Cluster 1: num  0.0001 0.0001 0.0001 0.0001 0.0133 ...
##  $ Cluster 2: num  0.0001 0.0001 0.0779 0.164 0.0437 ...
##  $ Cluster 3: num  0.25625 0.06443 0.24535 0.00835 0.5906 ...
##  $ Cluster 4: num  0.0001 0.1411 0.0257 0.0001 0.0001 ...
##  $ Cluster 5: num  0.000755 0.178439 0.0001 0.018456 0.0001 ...
##  $ Cluster 6: num  0.614 0.3 0.329 0.145 0.184 ...
##  $ Cluster 7: num  0.0001 0.0167 0.0001 0.0001 0.0001 ...
##  $ Cluster 8: num  0.0495 0.0001 0.1886 0.5164 0.1676 ...
##  $ Cluster 9: num  0.0795 0.2995 0.1333 0.1474 0.0001 ...
head(admix)
##   country   long     lat             Ind   Cluster 1   Cluster 2  Cluster 3
## 1 Austria 15.965 48.2875   AT_gr_12_fall 9.99639e-05 9.99639e-05 0.25625200
## 2 Austria 15.965 48.2875 AT_gr_12_spring 9.99730e-05 9.99730e-05 0.06442830
## 3 Austria 15.965 48.2875    AT_Mau_14_01 9.99730e-05 7.78667e-02 0.24535100
## 4 Austria 15.965 48.2875    AT_Mau_14_02 9.99730e-05 1.64043e-01 0.00834852
## 5 Austria 15.965 48.2875    AT_Mau_15_50 1.32607e-02 4.37458e-02 0.59060000
## 6 Austria 15.965 48.2875    AT_Mau_15_51 9.99640e-05 1.04134e-01 0.11331700
##     Cluster 4   Cluster 5 Cluster 6   Cluster 7   Cluster 8   Cluster 9
## 1 9.99639e-05 0.000755320  0.613557 9.99639e-05 0.049494200 0.079542400
## 2 1.41056e-01 0.178439000  0.299589 1.66984e-02 0.000099973 0.299490000
## 3 2.57084e-02 0.000099973  0.328833 9.99730e-05 0.188641000 0.133300000
## 4 9.99730e-05 0.018455600  0.145086 9.99730e-05 0.516370000 0.147398000
## 5 9.99640e-05 0.000099964  0.184407 9.99640e-05 0.167587000 0.000099964
## 6 9.99640e-05 0.036707400  0.319622 9.99640e-05 0.425819000 0.000099964
colnames(admix)[c(5, 6, 7, 8, 9, 10, 11, 12, 13)] <- c("cluster1", "cluster2", "cluster3", "cluster4", "cluster5", "cluster6", "cluster7", "cluster8", "cluster9")

pies <- admix[ , c(1:3, 5:13)]

meanpies <- aggregate(pies[2:12], by = list(pies$country), mean)



Plotting admixture proportions on a map

## Plotting map

## png("map_revision.png", width = 7, height = 7, units = "in", res = 300)

## pdf("map_revision.pdf")

quartz()

layout(matrix(c(1, 1, 1, 1,
                1, 1, 1, 1,
                2, 2, 2, 2,
                2, 2, 2, 2), nrow = 4, ncol = 4, byrow = TRUE))

map("state", fill = TRUE, col = "beige", border = "gray", asp = 1)
map.axes(cex.axis = 1, las = 1)
mtext(side = 1, line = 3, "Longitude")
mtext(side = 2, line = 3, "Latitude")
addCompass(-120, 29, rot = 0, useWest = TRUE, cex = 0.5, col.compass = "gainsboro")

## Adding pie charts to the map

for(i in 1:nrow(meanpies)){
    add.pie(z = c(meanpies$cluster1[i], meanpies$cluster2[i], meanpies$cluster3[i], meanpies$cluster4[i], meanpies$cluster5[i], meanpies$cluster6[i], meanpies$cluster7[i], meanpies$cluster8[i], meanpies$cluster9[i]), x = meanpies$long[i], y = meanpies$lat[i], radius = 1.5, col = RColorBrewer::brewer.pal(9,"Paired"), labels = "", border = FALSE)
    
}


newmap <- getMap(resolution = "low")
plot(newmap, col = "beige", border = "gray", xlim = c(-10, 40), ylim = c(35, 71), asp = 1)
map.axes(cex.axis = 1, las = 1)
mtext(side = 1, line = 3, "Longitude")
mtext(side = 2, line = 3, "Latitude")
addCompass(-16, 40, rot = 0, useWest = TRUE, cex = 0.5, col.compass = "gainsboro")


## Adding pie charts to the map

for(i in 1:nrow(meanpies)){
    add.pie(z = c(meanpies$cluster1[i], meanpies$cluster2[i], meanpies$cluster3[i], meanpies$cluster4[i], meanpies$cluster5[i], meanpies$cluster6[i], meanpies$cluster7[i], meanpies$cluster8[i], meanpies$cluster9[i]), x = meanpies$long[i], y = meanpies$lat[i], radius = 2.5, col = RColorBrewer::brewer.pal(9,"Paired"), labels = "", border = FALSE)
    
}

## dev.off()

Figure 3. Mean admixture proportions across populations of D. melanogaster surveyed in America (upper panel) and Europe (lower panel). Colors in the pie charts represent genetic clusters.





Isolation by Environment and Isolation by Distance analyses

unique(fly_gen3$pop)
##  [1] Austria         Esparto         Tuolumne        Germany        
##  [5] Denmark         Spain           Finland         France         
##  [9] NY-MA           MI-WI           PA              Russia         
## [13] Turkey          Ukraine         Charlottesville
## 15 Levels: Austria MI-WI Charlottesville Denmark Esparto Finland ... Ukraine
obj <- matrix(NA, nrow = length(unique(fly_gen3$pop)), ncol = 3, byrow = TRUE)

idx <- 1

for(i in unique(fly_gen3$pop)){
    long <- admix$long[admix$country == i][1]
    lat <- admix$lat[admix$country == i][1]
    country <- i
    obj[idx, ] <- c(country, long, lat)
    idx = idx + 1

}

obj
##       [,1]              [,2]          [,3]         
##  [1,] "Austria"         "15.965"      "48.2875"    
##  [2,] "Esparto"         "-122.046"    "38.68"      
##  [3,] "Tuolumne"        "-120.26"     "37.97"      
##  [4,] "Germany"         "9.7147565"   "48.199012"  
##  [5,] "Denmark"         "10.212586"   "55.945128"  
##  [6,] "Spain"           "1.279236"    "41.518208"  
##  [7,] "Finland"         "23.52"       "61.1"       
##  [8,] "France"          "3.5442054"   "46.8656662" 
##  [9,] "NY-MA"           "-74.085"     "42.445"     
## [10,] "MI-WI"           "-88.04915"   "42.61055"   
## [11,] "PA"              "-76.6350005" "40.3366975" 
## [12,] "Russia"          "33.235277"   "58.02444"   
## [13,] "Turkey"          "32.260328"   "40.231444"  
## [14,] "Ukraine"         "30.16599445" "49.47387778"
## [15,] "Charlottesville" "-78.47"      "38"
obj <- as.data.frame(obj)
colnames(obj) <- c("country", "x", "y")
obj$x <- as.numeric(obj$x)
obj$y <- as.numeric(obj$y)
rownames(obj) <- obj[ , 1]
obj <- obj[ , c(2, 3)]
str(obj)
## 'data.frame':    15 obs. of  2 variables:
##  $ x: num  15.96 -122.05 -120.26 9.71 10.21 ...
##  $ y: num  48.3 38.7 38 48.2 55.9 ...
fly_gen3@other <- obj[ , c(1, 2)]
fly_gen3
## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
## 
##  // Basic content
##    @tab:  151 x 3100 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3100 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
##    @other: a list containing: x  y
## genetic distance

GD.pop.PairwiseFst.hierfstat <- as.dist(hierfstat::pairwise.neifst(hierfstat::genind2hierfstat(fly_gen3)))

## geographic distance

Dgeo <- round(dist(fly_gen3$other), 3)

## making sure the order of rows and columns are the same in all of the matrices

ord.gen <- as.matrix(GD.pop.PairwiseFst.hierfstat)
ord.geo <- as.matrix(Dgeo)

ord.geo
##                 Austria Esparto Tuolumne Germany Denmark   Spain Finland
## Austria           0.000 138.345  136.615   6.251   9.578  16.171  14.874
## Esparto         138.345   0.000    1.922 132.104 133.381 123.358 147.282
## Tuolumne        136.615   1.922    0.000 130.377 131.705 121.591 145.629
## Germany           6.251 132.104  130.377   0.000   7.762  10.761  18.895
## Denmark           9.578 133.381  131.705   7.762   0.000  16.969  14.271
## Spain            16.171 123.358  121.591  10.761  16.969   0.000  29.633
## Finland          14.874 147.282  145.629  18.895  14.271  29.633   0.000
## France           12.502 125.857  124.123   6.313  11.265   5.807  24.529
## NY-MA            90.239  48.109   46.391  83.997  85.372  75.370  99.372
## MI-WI           104.169  34.223   32.543  97.924  99.162  89.335 113.091
## PA               92.941  45.441   43.689  86.707  88.239  77.923 102.285
## Russia           19.826 156.482  154.800  25.490  23.116  35.967  10.190
## Turkey           18.178 154.314  152.537  23.912  27.074  31.008  22.625
## Ukraine          14.250 152.594  150.865  20.491  20.977  29.962  13.392
## Charlottesville  94.994  43.581   41.790  88.773  90.480  79.827 104.573
##                  France   NY-MA   MI-WI      PA  Russia  Turkey Ukraine
## Austria          12.502  90.239 104.169  92.941  19.826  18.178  14.250
## Esparto         125.857  48.109  34.223  45.441 156.482 154.314 152.594
## Tuolumne        124.123  46.391  32.543  43.689 154.800 152.537 150.865
## Germany           6.313  83.997  97.924  86.707  25.490  23.912  20.491
## Denmark          11.265  85.372  99.162  88.239  23.116  27.074  20.977
## Spain             5.807  75.370  89.335  77.923  35.967  31.008  29.962
## Finland          24.529  99.372 113.091 102.285  10.190  22.625  13.392
## France            0.000  77.755  91.692  80.445  31.719  29.473  26.749
## NY-MA            77.755   0.000  13.965   3.309 108.445 106.368 104.488
## MI-WI            91.692  13.965   0.000  11.638 122.260 120.333 118.414
## PA               80.445   3.309  11.638   0.000 111.285 108.895 107.191
## Russia           31.719 108.445 122.260 111.285   0.000  17.820   9.085
## Turkey           29.473 106.368 120.333 108.895  17.820   0.000   9.477
## Ukraine          26.749 104.488 118.414 107.191   9.085   9.477   0.000
## Charlottesville  82.492   6.244  10.631   2.971 113.486 110.753 109.240
##                 Charlottesville
## Austria                  94.994
## Esparto                  43.581
## Tuolumne                 41.790
## Germany                  88.773
## Denmark                  90.480
## Spain                    79.827
## Finland                 104.573
## France                   82.492
## NY-MA                     6.244
## MI-WI                    10.631
## PA                        2.971
## Russia                  113.486
## Turkey                  110.753
## Ukraine                 109.240
## Charlottesville           0.000
ord.gen <- ord.gen[rownames(ord.geo), colnames(ord.geo)]
ord.gen
##                 Austria Esparto Tuolumne Germany Denmark  Spain Finland France
## Austria          0.0000  0.0557   0.0588  0.0128  0.0181 0.0165  0.0151 0.0212
## Esparto          0.0557  0.0000  -0.0013  0.0441  0.0490 0.0446  0.0547 0.0414
## Tuolumne         0.0588 -0.0013   0.0000  0.0468  0.0533 0.0479  0.0601 0.0457
## Germany          0.0128  0.0441   0.0468  0.0000  0.0170 0.0077  0.0136 0.0100
## Denmark          0.0181  0.0490   0.0533  0.0170  0.0000 0.0256  0.0246 0.0186
## Spain            0.0165  0.0446   0.0479  0.0077  0.0256 0.0000  0.0200 0.0158
## Finland          0.0151  0.0547   0.0601  0.0136  0.0246 0.0200  0.0000 0.0222
## France           0.0212  0.0414   0.0457  0.0100  0.0186 0.0158  0.0222 0.0000
## NY-MA            0.0439  0.0089   0.0082  0.0335  0.0403 0.0295  0.0435 0.0334
## MI-WI            0.0540  0.0130   0.0094  0.0412  0.0501 0.0396  0.0492 0.0422
## PA               0.0541  0.0141   0.0099  0.0414  0.0519 0.0393  0.0521 0.0440
## Russia           0.0169  0.0535   0.0605  0.0210  0.0234 0.0318  0.0192 0.0222
## Turkey           0.0139  0.0600   0.0618  0.0188  0.0217 0.0264  0.0163 0.0280
## Ukraine          0.0137  0.0627   0.0674  0.0196  0.0234 0.0308  0.0214 0.0281
## Charlottesville  0.0582  0.0172   0.0153  0.0466  0.0541 0.0466  0.0574 0.0448
##                  NY-MA  MI-WI     PA Russia Turkey Ukraine Charlottesville
## Austria         0.0439 0.0540 0.0541 0.0169 0.0139  0.0137          0.0582
## Esparto         0.0089 0.0130 0.0141 0.0535 0.0600  0.0627          0.0172
## Tuolumne        0.0082 0.0094 0.0099 0.0605 0.0618  0.0674          0.0153
## Germany         0.0335 0.0412 0.0414 0.0210 0.0188  0.0196          0.0466
## Denmark         0.0403 0.0501 0.0519 0.0234 0.0217  0.0234          0.0541
## Spain           0.0295 0.0396 0.0393 0.0318 0.0264  0.0308          0.0466
## Finland         0.0435 0.0492 0.0521 0.0192 0.0163  0.0214          0.0574
## France          0.0334 0.0422 0.0440 0.0222 0.0280  0.0281          0.0448
## NY-MA           0.0000 0.0057 0.0017 0.0474 0.0482  0.0535          0.0031
## MI-WI           0.0057 0.0000 0.0042 0.0537 0.0537  0.0616          0.0049
## PA              0.0017 0.0042 0.0000 0.0596 0.0564  0.0646          0.0045
## Russia          0.0474 0.0537 0.0596 0.0000 0.0167  0.0067          0.0597
## Turkey          0.0482 0.0537 0.0564 0.0167 0.0000  0.0129          0.0574
## Ukraine         0.0535 0.0616 0.0646 0.0067 0.0129  0.0000          0.0656
## Charlottesville 0.0031 0.0049 0.0045 0.0597 0.0574  0.0656          0.0000
fst.dist <- round(as.dist(ord.gen), 3)
fst.dist
##                 Austria Esparto Tuolumne Germany Denmark  Spain Finland France
## Esparto           0.056                                                       
## Tuolumne          0.059  -0.001                                               
## Germany           0.013   0.044    0.047                                      
## Denmark           0.018   0.049    0.053   0.017                              
## Spain             0.016   0.045    0.048   0.008   0.026                      
## Finland           0.015   0.055    0.060   0.014   0.025  0.020               
## France            0.021   0.041    0.046   0.010   0.019  0.016   0.022       
## NY-MA             0.044   0.009    0.008   0.034   0.040  0.029   0.044  0.033
## MI-WI             0.054   0.013    0.009   0.041   0.050  0.040   0.049  0.042
## PA                0.054   0.014    0.010   0.041   0.052  0.039   0.052  0.044
## Russia            0.017   0.054    0.060   0.021   0.023  0.032   0.019  0.022
## Turkey            0.014   0.060    0.062   0.019   0.022  0.026   0.016  0.028
## Ukraine           0.014   0.063    0.067   0.020   0.023  0.031   0.021  0.028
## Charlottesville   0.058   0.017    0.015   0.047   0.054  0.047   0.057  0.045
##                  NY-MA  MI-WI     PA Russia Turkey Ukraine
## Esparto                                                   
## Tuolumne                                                  
## Germany                                                   
## Denmark                                                   
## Spain                                                     
## Finland                                                   
## France                                                    
## NY-MA                                                     
## MI-WI            0.006                                    
## PA               0.002  0.004                             
## Russia           0.047  0.054  0.060                      
## Turkey           0.048  0.054  0.056  0.017               
## Ukraine          0.054  0.062  0.065  0.007  0.013        
## Charlottesville  0.003  0.005  0.004  0.060  0.057   0.066
Dgeo
##                 Austria Esparto Tuolumne Germany Denmark   Spain Finland
## Esparto         138.345                                                 
## Tuolumne        136.615   1.922                                         
## Germany           6.251 132.104  130.377                                
## Denmark           9.578 133.381  131.705   7.762                        
## Spain            16.171 123.358  121.591  10.761  16.969                
## Finland          14.874 147.282  145.629  18.895  14.271  29.633        
## France           12.502 125.857  124.123   6.313  11.265   5.807  24.529
## NY-MA            90.239  48.109   46.391  83.997  85.372  75.370  99.372
## MI-WI           104.169  34.223   32.543  97.924  99.162  89.335 113.091
## PA               92.941  45.441   43.689  86.707  88.239  77.923 102.285
## Russia           19.826 156.482  154.800  25.490  23.116  35.967  10.190
## Turkey           18.178 154.314  152.537  23.912  27.074  31.008  22.625
## Ukraine          14.250 152.594  150.865  20.491  20.977  29.962  13.392
## Charlottesville  94.994  43.581   41.790  88.773  90.480  79.827 104.573
##                  France   NY-MA   MI-WI      PA  Russia  Turkey Ukraine
## Esparto                                                                
## Tuolumne                                                               
## Germany                                                                
## Denmark                                                                
## Spain                                                                  
## Finland                                                                
## France                                                                 
## NY-MA            77.755                                                
## MI-WI            91.692  13.965                                        
## PA               80.445   3.309  11.638                                
## Russia           31.719 108.445 122.260 111.285                        
## Turkey           29.473 106.368 120.333 108.895  17.820                
## Ukraine          26.749 104.488 118.414 107.191   9.085   9.477        
## Charlottesville  82.492   6.244  10.631   2.971 113.486 110.753 109.240
NPP_file <- list.files("/Users/dpadil10/Dropbox (ASU)/npp-geotiff", full.names = TRUE)
NPP_file
## [1] "/Users/dpadil10/Dropbox (ASU)/npp-geotiff/npp_geotiff.tif"
NPP.data <- stack(NPP_file)

NPP.data.ext.dis <- extract(NPP.data, obj)
head(NPP.data.ext.dis)
##       npp_geotiff
## [1,] 264522317824
## [2,] 391191265280
## [3,] 367969927168
## [4,] 348927459328
## [5,]  25091205120
## [6,] 268916719616
rownames(NPP.data.ext.dis) <- rownames(obj)

DNPP <- round(dist(log(NPP.data.ext.dis)), 3)
DNPP
##                 Austria Esparto Tuolumne Germany Denmark Spain Finland France
## Esparto           0.391                                                      
## Tuolumne          0.330   0.061                                              
## Germany           0.277   0.114    0.053                                     
## Denmark           2.355   2.747    2.685   2.632                             
## Spain             0.016   0.375    0.314   0.260   2.372                     
## Finland           0.610   1.001    0.940   0.886   1.746 0.626               
## France            0.577   0.185    0.247   0.300   2.932 0.560   1.186       
## NY-MA             0.342   0.050    0.011   0.065   2.697 0.325   0.951  0.235
## MI-WI             0.236   0.155    0.094   0.041   2.592 0.220   0.846  0.340
## PA                0.452   0.061    0.122   0.175   2.808 0.436   1.062  0.124
## Russia            0.549   0.940    0.879   0.826   1.806 0.565   0.061  1.126
## Turkey            0.626   1.017    0.956   0.903   1.729 0.643   0.017  1.203
## Ukraine           0.075   0.466    0.405   0.352   2.280 0.092   0.534  0.652
## Charlottesville   0.526   0.134    0.196   0.249   2.881 0.509   1.135  0.051
##                 NY-MA MI-WI    PA Russia Turkey Ukraine
## Esparto                                                
## Tuolumne                                               
## Germany                                                
## Denmark                                                
## Spain                                                  
## Finland                                                
## France                                                 
## NY-MA                                                  
## MI-WI           0.105                                  
## PA              0.111 0.216                            
## Russia          0.891 0.785 1.001                      
## Turkey          0.968 0.862 1.078  0.077               
## Ukraine         0.417 0.311 0.527  0.474  0.551        
## Charlottesville 0.184 0.290 0.074  1.075  1.152   0.601
## R script for 'MMRR' function that performs Multiple Matrix Regression with Randomization analysis.

source("MMRR.R")

distances <- list(Dgeo = as.matrix(Dgeo), DNPP = as.matrix(DNPP))

MMRR(as.matrix(fst.dist), distances)
## $r.squared
## r.squared 
##   0.86385 
## 
## $coefficients
##    Intercept         Dgeo         DNPP 
## 0.0072219678 0.0003551048 0.0036128387 
## 
## $tstatistic
## Intercept(t)      Dgeo(t)      DNPP(t) 
##     5.380919    24.813658     3.906555 
## 
## $tpvalue
## Intercept(p)      Dgeo(p)      DNPP(p) 
##        0.999        0.001        0.001 
## 
## $Fstatistic
## F-statistic 
##    323.5867 
## 
## $Fpvalue
## F p-value 
##     0.001
## partial mantel test

part.mant <- mantel.partial(fst.dist, DNPP, Dgeo, method = "pearson", permutations = 1000)
part.mant
## 
## Partial Mantel statistic based on Pearson's product-moment correlation 
## 
## Call:
## mantel.partial(xdis = fst.dist, ydis = DNPP, zdis = Dgeo, method = "pearson",      permutations = 1000) 
## 
## Mantel statistic r: 0.3608 
##       Significance: 0.001998 
## 
## Upper quantiles of permutations (null model):
##    90%    95%  97.5%    99% 
## 0.0912 0.1217 0.1531 0.1984 
## Permutation: free
## Number of permutations: 1000
part.mant2 <- mantel.partial(fst.dist, Dgeo, DNPP, method = "pearson", permutations = 1000)
part.mant2
## 
## Partial Mantel statistic based on Pearson's product-moment correlation 
## 
## Call:
## mantel.partial(xdis = fst.dist, ydis = Dgeo, zdis = DNPP, method = "pearson",      permutations = 1000) 
## 
## Mantel statistic r: 0.9262 
##       Significance: 0.000999 
## 
## Upper quantiles of permutations (null model):
##   90%   95% 97.5%   99% 
## 0.101 0.167 0.223 0.338 
## Permutation: free
## Number of permutations: 1000
## mantel test

ibd <- mantel.randtest(fst.dist, Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = fst.dist, m2 = Dgeo)
## 
## Observation: 0.9184113 
## 
## Based on 999 replicates
## Simulated p-value: 0.001 
## Alternative hypothesis: greater 
## 
##     Std.Obs Expectation    Variance 
## 9.173596292 0.003686283 0.009942643
quartz()
plot(ibd, las = 1, main = "") ## isolation by distance is clearly significant

mantCor <- mgram(fst.dist, Dgeo, nperm = 1000)
mantCor
## $mgram
##         lag ngroup     mantelr  pval          llim        ulim
## [1,]   9.66     27  0.64245382 0.001  5.918708e-01  0.69524902
## [2,]  28.98     18  0.28493265 0.014  2.262106e-01  0.34487195
## [3,]  48.30      6  0.27711014 0.015  1.829388e-01  0.34511218
## [4,]  67.62      1  0.02464136 0.968 -5.108269e-17  0.03538879
## [5,]  86.94     16 -0.23755747 0.022 -3.308312e-01 -0.16036366
## [6,] 106.26     16 -0.43851519 0.001 -5.317588e-01 -0.36207546
## [7,] 125.58     11 -0.27585827 0.016 -3.384386e-01 -0.17825522
## [8,] 144.90      8 -0.39397231 0.002 -4.881426e-01 -0.28195888
## 
## $resids
## [1] NA
## 
## attr(,"class")
## [1] "mgram"
mantCor2 <- mgram(fst.dist, DNPP, nperm = 1000)
mantCor2
## $mgram
##            lag ngroup     mantelr  pval          llim          ulim
## [1,] 0.1825625     44  0.36681725 0.001  2.370974e-01  4.910210e-01
## [2,] 0.5476875     23 -0.07860655 0.336 -2.271801e-01  6.511911e-02
## [3,] 0.9128125     19 -0.36758903 0.005 -4.938220e-01 -2.405678e-01
## [4,] 1.2779375      5 -0.03904606 0.654 -1.860046e-01  1.023411e-01
## [5,] 1.6430625      3  0.09357366 0.308 -2.944606e-17  1.305096e-01
## [6,] 2.0081875      0  0.00000000 1.000  0.000000e+00  0.000000e+00
## [7,] 2.3733125      3  0.10248140 0.260 -1.957163e-17  1.362118e-01
## [8,] 2.7384375      7 -0.15494628 0.064 -2.108242e-01  7.556146e-18
## 
## $resids
## [1] NA
## 
## attr(,"class")
## [1] "mgram"
plot(mantCor, las = 1)

plot(mantCor2, las = 1)

## filled dots indicate significant correlations. The Mantel correlogram shows that genotypes are relatively similar at short distance, while this similarity decreases with distance. The negative correlations indicate genetic differenciation between some populations


dens <- MASS::kde2d(as.vector(Dgeo), as.vector(fst.dist), n = 300)
myPal <- colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(as.vector(Dgeo), as.vector(fst.dist), pch = 20, xlab = "Geographic Distance", ylab = "Genetic Distance", las = 1)
image(dens, col = transp(myPal(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), lwd = 2)
lines(loess.smooth(as.vector(Dgeo), as.vector(fst.dist)), col = "red", lwd = 2)

dens2 <- MASS::kde2d(as.vector(DNPP), as.vector(fst.dist), n = 300)
myPal2 <- colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(as.vector(DNPP), as.vector(fst.dist), pch = 20, xlab = "NPP Distance", ylab = "Genetic Distance", las = 1)
image(dens2, col = transp(myPal2(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(DNPP)), lwd = 2)
lines(loess.smooth(as.vector(DNPP), as.vector(fst.dist)), col = "red", lwd = 2)

## png("distance.png", width = 7, height = 7, units = "in", res = 300)

## pdf("distance.pdf")

layout(matrix(c(1, 1, 2, 2,
                1, 1, 2, 2,
                3, 3, 4, 4,
                3, 3, 4, 4), nrow = 4, ncol = 4, byrow = TRUE))


par(mar = c(4.2, 4.1, 1, 1))
plot(as.vector(Dgeo), as.vector(fst.dist), pch = 20, xlab = "Geographic Distance", ylab = "Genetic Distance", las = 1, cex.lab = 1.4)
image(dens, col = transp(myPal(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), lwd = 2)
lines(loess.smooth(as.vector(Dgeo), as.vector(fst.dist)), col = "red", lwd = 2)
mtext("(a)", side = 2, at = 0.07, line = 2.6, font = 2, family = "serif", las = 1)

par(mar = c(4, 4.1, 1, 1))
plot(as.vector(DNPP), as.vector(fst.dist), pch = 20, xlab = "NPP Distance", ylab = "Genetic Distance", las = 1, cex.lab = 1.4)
image(dens2, col = transp(myPal2(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(DNPP)), lwd = 2)
lines(loess.smooth(as.vector(DNPP), as.vector(fst.dist)), col = "red", lwd = 2)
mtext("(b)", side = 2, at = 0.07, line = 3.1, font = 2, family = "serif", las = 1)

par(mar = c(4.2, 4.1, 1, 1))
plot(mantCor, las = 1, cex.axis = 1, cex.lab = 1.6, xlab = "Geographic Distance")
mtext("(c)", side = 2, at = 0.7, line = 2.6, font = 2, family = "serif", las = 1)

par(mar = c(4, 4.1, 1, 1))
plot(mantCor2, las = 1, cex.axis = 1, cex.lab = 1.6, ylim = c(-0.4, 0.6), xlab = "NPP Distance")
mtext("(d)", side = 2, at = 0.65, line = 3.1, font = 2, family = "serif", las = 1)

## dev.off()

Figure 6. Patterns of isolation by distance and isolation by environment according to Partial Mantel tests and Correlograms. (a-b) Geographic and net primary production distances as a function of genetic distance. Colors represent estimated probability densities and the red line a smoothed local mean. (c-d) Mantel correlation at different distance classes as a function of geographic and net primary production distances. Filled dots indicate significant correlations.



Analysis of seasonlity and adaptation test

## extracting environmental variables from WorldClim and other sources


mat_adap <- admix[ , c(1, 2, 3, 4)]
head(mat_adap)
##   country   long     lat             Ind
## 1 Austria 15.965 48.2875   AT_gr_12_fall
## 2 Austria 15.965 48.2875 AT_gr_12_spring
## 3 Austria 15.965 48.2875    AT_Mau_14_01
## 4 Austria 15.965 48.2875    AT_Mau_14_02
## 5 Austria 15.965 48.2875    AT_Mau_15_50
## 6 Austria 15.965 48.2875    AT_Mau_15_51
mat_adap$season <- rep(NA, nrow(mat_adap))

idx <- 1
for(i in mat_adap$Ind){
    obj <- unique(afis$season[afis$sampleId == i])
    mat_adap[idx, 5] <- obj
    idx = idx + 1
}

mat_adap$season[mat_adap$season == "frost"] <- "fall"
head(mat_adap)
##   country   long     lat             Ind season
## 1 Austria 15.965 48.2875   AT_gr_12_fall   fall
## 2 Austria 15.965 48.2875 AT_gr_12_spring spring
## 3 Austria 15.965 48.2875    AT_Mau_14_01 spring
## 4 Austria 15.965 48.2875    AT_Mau_14_02   fall
## 5 Austria 15.965 48.2875    AT_Mau_15_50 spring
## 6 Austria 15.965 48.2875    AT_Mau_15_51   fall
## extracting NPP


NPP_file <- list.files("/Users/dpadil10/Dropbox (ASU)/npp-geotiff", full.names = TRUE)
NPP_file
## [1] "/Users/dpadil10/Dropbox (ASU)/npp-geotiff/npp_geotiff.tif"
NPP.data <- stack(NPP_file)

prod <- mat_adap[ , c(2, 3)] ## loading the dataframe with the coordinates
head(prod)
##     long     lat
## 1 15.965 48.2875
## 2 15.965 48.2875
## 3 15.965 48.2875
## 4 15.965 48.2875
## 5 15.965 48.2875
## 6 15.965 48.2875
NPP.data.ext <- extract(NPP.data, prod)
head(NPP.data.ext)
##       npp_geotiff
## [1,] 264522317824
## [2,] 264522317824
## [3,] 264522317824
## [4,] 264522317824
## [5,] 264522317824
## [6,] 264522317824
NPP.data.ext <- as.data.frame(NPP.data.ext)
colnames(NPP.data.ext) <- "NPP"
head(NPP.data.ext)
##            NPP
## 1 264522317824
## 2 264522317824
## 3 264522317824
## 4 264522317824
## 5 264522317824
## 6 264522317824
##is.na(NPP.data.ext)


## extracting BIO15 precipitation seasonality (coefficient of variation), BIO4 temperature seasonality (standard deviation * 100)

files <- list.files("/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio", full.names = TRUE)
files
##  [1] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_10.tif"
##  [2] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_11.tif"
##  [3] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_14.tif"
##  [4] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_15.tif"
##  [5] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_16.tif"
##  [6] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_18.tif"
##  [7] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_19.tif"
##  [8] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_2.tif" 
##  [9] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_3.tif" 
## [10] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_4.tif" 
## [11] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_5.tif" 
## [12] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_6.tif" 
## [13] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_8.tif" 
## [14] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_9.tif"
pre_sea <- stack(files[4])
tem_sea <- stack(files[10])

coord <- mat_adap[ , c(2, 3)]
head(coord)
##     long     lat
## 1 15.965 48.2875
## 2 15.965 48.2875
## 3 15.965 48.2875
## 4 15.965 48.2875
## 5 15.965 48.2875
## 6 15.965 48.2875
prec.season <- extract(pre_sea, coord)
temp.season <- extract(tem_sea, coord)

variables <- cbind(prec.season, temp.season, log(NPP.data.ext))
head(variables)
##   wc2.1_30s_bio_15 wc2.1_30s_bio_4      NPP
## 1         37.24262        744.4373 26.30119
## 2         37.24262        744.4373 26.30119
## 3         37.24262        744.4373 26.30119
## 4         37.24262        744.4373 26.30119
## 5         37.24262        744.4373 26.30119
## 6         37.24262        744.4373 26.30119
colnames(variables) <- c("precseason", "tempseason", "logNPP")
head(variables)
##   precseason tempseason   logNPP
## 1   37.24262   744.4373 26.30119
## 2   37.24262   744.4373 26.30119
## 3   37.24262   744.4373 26.30119
## 4   37.24262   744.4373 26.30119
## 5   37.24262   744.4373 26.30119
## 6   37.24262   744.4373 26.30119
env <- cbind(mat_adap, variables)
head(env)
##   country   long     lat             Ind season precseason tempseason   logNPP
## 1 Austria 15.965 48.2875   AT_gr_12_fall   fall   37.24262   744.4373 26.30119
## 2 Austria 15.965 48.2875 AT_gr_12_spring spring   37.24262   744.4373 26.30119
## 3 Austria 15.965 48.2875    AT_Mau_14_01 spring   37.24262   744.4373 26.30119
## 4 Austria 15.965 48.2875    AT_Mau_14_02   fall   37.24262   744.4373 26.30119
## 5 Austria 15.965 48.2875    AT_Mau_15_50 spring   37.24262   744.4373 26.30119
## 6 Austria 15.965 48.2875    AT_Mau_15_51   fall   37.24262   744.4373 26.30119
## PCA of environmental variables

x <- tab(fly_gen3, NA.method = "mean")

snppca <- as.data.frame(x)
snppca$Ind <- rownames(x)
snppca$country <- fly_gen3$pop
dim(snppca)
## [1]  151 3102
gen <- snppca[order(snppca$country), ]

merging <- merge(gen, env, by = "Ind")
merging[1:5, 1:5]
##               Ind 162333.A 162333.C 162335.C 162335.T
## 1   AT_gr_12_fall        1        1        1        1
## 2 AT_gr_12_spring        1        1        1        1
## 3    AT_Mau_14_01        0        2        1        1
## 4    AT_Mau_14_02        1        1        1        1
## 5    AT_Mau_15_50        1        1        1        1
rownames(merging) <- merging[ , 1]
dim(merging)
## [1]  151 3109
merging[1:5, 3101:3109]
##                 163997.T country.x country.y   long     lat season precseason
## AT_gr_12_fall          0   Austria   Austria 15.965 48.2875   fall   37.24262
## AT_gr_12_spring        0   Austria   Austria 15.965 48.2875 spring   37.24262
## AT_Mau_14_01           0   Austria   Austria 15.965 48.2875 spring   37.24262
## AT_Mau_14_02           1   Austria   Austria 15.965 48.2875   fall   37.24262
## AT_Mau_15_50           0   Austria   Austria 15.965 48.2875 spring   37.24262
##                 tempseason   logNPP
## AT_gr_12_fall     744.4373 26.30119
## AT_gr_12_spring   744.4373 26.30119
## AT_Mau_14_01      744.4373 26.30119
## AT_Mau_14_02      744.4373 26.30119
## AT_Mau_15_50      744.4373 26.30119
merging <- merging[ , -c(1, 3102:3109)]
dim(merging)
## [1]  151 3100
merging[1:5, 3090:3100]
##                 163992.G 163993.T 163993.C 163994.G 163994.A 163995.G 163995.A
## AT_gr_12_fall          0        1        1        1        1        2        0
## AT_gr_12_spring        0        1        1        1        1        2        0
## AT_Mau_14_01           1        1        1        1        1        2        0
## AT_Mau_14_02           0        1        1        1        1        2        0
## AT_Mau_15_50           1        1        1        1        1        2        0
##                 163996.G 163996.C 163997.A 163997.T
## AT_gr_12_fall          1        1        2        0
## AT_gr_12_spring        1        1        2        0
## AT_Mau_14_01           1        1        2        0
## AT_Mau_14_02           1        1        1        1
## AT_Mau_15_50           1        1        2        0
genomic <- merging

env$Ind <- as.factor(env$Ind)
env$Ind <- as.character(levels(env$Ind))
dim(snppca)
## [1]  151 3102
snppca[1:5, 1:5]
##                 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall          1        1        1        1        2
## AT_gr_12_spring        1        1        1        1        2
## AT_Mau_14_01           0        2        1        1        1
## AT_Mau_14_02           1        1        1        1        2
## AT_Mau_15_50           1        1        1        1        2
snppca[1:5, 3090:3102]
##                 163992.G 163993.T 163993.C 163994.G 163994.A 163995.G 163995.A
## AT_gr_12_fall          0        1        1        1        1        2        0
## AT_gr_12_spring        0        1        1        1        1        2        0
## AT_Mau_14_01           1        1        1        1        1        2        0
## AT_Mau_14_02           0        1        1        1        1        2        0
## AT_Mau_15_50           1        1        1        1        1        2        0
##                 163996.G 163996.C 163997.A 163997.T             Ind country
## AT_gr_12_fall          1        1        2        0   AT_gr_12_fall Austria
## AT_gr_12_spring        1        1        2        0 AT_gr_12_spring Austria
## AT_Mau_14_01           1        1        2        0    AT_Mau_14_01 Austria
## AT_Mau_14_02           1        1        1        1    AT_Mau_14_02 Austria
## AT_Mau_15_50           1        1        2        0    AT_Mau_15_50 Austria
snppca <- snppca[ , -c(3101, 3102)]
dim(snppca)
## [1]  151 3100
snppca[1:5, 3090:3100]
##                 163992.G 163993.T 163993.C 163994.G 163994.A 163995.G 163995.A
## AT_gr_12_fall          0        1        1        1        1        2        0
## AT_gr_12_spring        0        1        1        1        1        2        0
## AT_Mau_14_01           1        1        1        1        1        2        0
## AT_Mau_14_02           0        1        1        1        1        2        0
## AT_Mau_15_50           1        1        1        1        1        2        0
##                 163996.G 163996.C 163997.A 163997.T
## AT_gr_12_fall          1        1        2        0
## AT_gr_12_spring        1        1        2        0
## AT_Mau_14_01           1        1        2        0
## AT_Mau_14_02           1        1        1        1
## AT_Mau_15_50           1        1        2        0
identical(rownames(genomic), env[ , 4]) ## genotypes and environmental data are in the same order
## [1] TRUE
## How many seasons per country are there?

table(data.frame(season = env$season, country = env$country))
##         country
## season   Austria Charlottesville Denmark Esparto Finland France Germany MI-WI
##   fall         3               3       4       3       2      4       6     5
##   spring       3               3       1       2       4      6       6     4
##         country
## season   NY-MA PA Russia Spain Tuolumne Turkey Ukraine
##   fall       3  9      3     4        4      4      14
##   spring     3  8      3     3        1      9      24
xtable(table(data.frame(season = env$season, country = env$country)), caption = "caption here")
## % latex table generated in R 4.2.2 by xtable 1.8-4 package
## % Tue Sep  5 00:01:23 2023
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrrrrrrrrrrrr}
##   \hline
##  & Austria & Charlottesville & Denmark & Esparto & Finland & France & Germany & MI-WI & NY-MA & PA & Russia & Spain & Tuolumne & Turkey & Ukraine \\ 
##   \hline
## fall &   3 &   3 &   4 &   3 &   2 &   4 &   6 &   5 &   3 &   9 &   3 &   4 &   4 &   4 &  14 \\ 
##   spring &   3 &   3 &   1 &   2 &   4 &   6 &   6 &   4 &   3 &   8 &   3 &   3 &   1 &   9 &  24 \\ 
##    \hline
## \end{tabular}
## \caption{caption here} 
## \end{table}
names(env)
## [1] "country"    "long"       "lat"        "Ind"        "season"    
## [6] "precseason" "tempseason" "logNPP"
## CCA

gen.cca <- cca(genomic ~ season*country, env)
gen.cca
## Call: cca(formula = genomic ~ season * country, data = env)
## 
##               Inertia Proportion Rank
## Total         0.21786    1.00000     
## Constrained   0.06199    0.28453   29
## Unconstrained 0.15587    0.71547  121
## Inertia is scaled Chi-square 
## 
## Eigenvalues for constrained axes:
##     CCA1     CCA2     CCA3     CCA4     CCA5     CCA6     CCA7     CCA8 
## 0.016564 0.005943 0.003270 0.003129 0.002648 0.002531 0.002046 0.001977 
##     CCA9    CCA10    CCA11    CCA12    CCA13    CCA14    CCA15    CCA16 
## 0.001709 0.001637 0.001583 0.001509 0.001448 0.001341 0.001248 0.001224 
##    CCA17    CCA18    CCA19    CCA20    CCA21    CCA22    CCA23    CCA24 
## 0.001160 0.001097 0.001069 0.001030 0.000996 0.000987 0.000961 0.000899 
##    CCA25    CCA26    CCA27    CCA28    CCA29 
## 0.000855 0.000828 0.000826 0.000780 0.000692 
## 
## Eigenvalues for unconstrained axes:
##      CA1      CA2      CA3      CA4      CA5      CA6      CA7      CA8 
## 0.014281 0.005240 0.003714 0.003412 0.003133 0.002981 0.002858 0.002616 
## (Showing 8 of 121 unconstrained eigenvalues)
anova(gen.cca)
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = genomic ~ season * country, data = env)
##           Df ChiSquare      F Pr(>F)    
## Model     29  0.061989 1.6593  0.001 ***
## Residual 121  0.155874                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
env$country <- as.factor(env$country)
levels(env$country)
##  [1] "Austria"         "Charlottesville" "Denmark"         "Esparto"        
##  [5] "Finland"         "France"          "Germany"         "MI-WI"          
##  [9] "NY-MA"           "PA"              "Russia"          "Spain"          
## [13] "Tuolumne"        "Turkey"          "Ukraine"
cols <- env$country
cols <- as.numeric(cols)

ncol <- nPop(fly_gen3)
palette <- distinctColorPalette(ncol)

colors <- rep()

for(i in cols){
    col <- palette[i]
    colors <- c(colors, col)
}

colors
##   [1] "#8967DC" "#8967DC" "#8967DC" "#8967DC" "#8967DC" "#8967DC" "#728FD5"
##   [8] "#728FD5" "#728FD5" "#728FD5" "#728FD5" "#728FD5" "#83C2D9" "#83C2D9"
##  [15] "#83C2D9" "#83C2D9" "#83C2D9" "#DAB08F" "#DAB08F" "#DAB08F" "#DAB08F"
##  [22] "#DAB08F" "#BC3FE2" "#BC3FE2" "#BC3FE2" "#BC3FE2" "#BC3FE2" "#BC3FE2"
##  [29] "#DF6863" "#DF6863" "#DF6863" "#DF6863" "#DF6863" "#DF6863" "#DF6863"
##  [36] "#DF6863" "#DF6863" "#DF6863" "#85967A" "#85967A" "#85967A" "#85967A"
##  [43] "#85967A" "#85967A" "#85967A" "#85967A" "#85967A" "#85967A" "#85967A"
##  [50] "#85967A" "#B1E750" "#B1E750" "#B1E750" "#B1E750" "#B1E750" "#B1E750"
##  [57] "#B1E750" "#B1E750" "#B1E750" "#6EE189" "#6EE189" "#6EE189" "#6EE189"
##  [64] "#6EE189" "#6EE189" "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C"
##  [71] "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C"
##  [78] "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C" "#D3E59C" "#E2BC4D" "#E2BC4D"
##  [85] "#E2BC4D" "#E2BC4D" "#E2BC4D" "#E2BC4D" "#D8DFD5" "#D8DFD5" "#D8DFD5"
##  [92] "#D8DFD5" "#D8DFD5" "#D8DFD5" "#D8DFD5" "#D7ABD2" "#D7ABD2" "#D7ABD2"
##  [99] "#D7ABD2" "#D7ABD2" "#7BE6D2" "#7BE6D2" "#7BE6D2" "#7BE6D2" "#7BE6D2"
## [106] "#7BE6D2" "#7BE6D2" "#7BE6D2" "#7BE6D2" "#7BE6D2" "#7BE6D2" "#7BE6D2"
## [113] "#7BE6D2" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0"
## [120] "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0"
## [127] "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0"
## [134] "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0"
## [141] "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0"
## [148] "#DA6AC0" "#DA6AC0" "#DA6AC0" "#DA6AC0"
## png("cca_revision.png", width = 7, height = 7, units = "in", res = 300)

## pdf("cca_revision.pdf")

plot(gen.cca, display = "sites", type = "n", las = 1)
with(gen.cca, points(gen.cca, display = "sites", pch = 21, bg = colors, col = colors))
##with(env[3], ordiellipse(gen.cca, env$country, kind = "se", conf = 0.95, col = "gray"))
##with(env[3], ordihull(gen.cca, env$country, col = colors, lty = 2))
with(env[3], ordispider(gen.cca, env$country, col = unique(colors), label = TRUE))

##legend("topright", legend = levels(env$country), pch = 19, col = unique(cols), bty = "n", cex = 0.8)

## dev.off()

Figure 4. Constrained correspondence analysis depicting the difference in allele frequency across populations mediated by the interaction between season and locality. Filled dots represent pooled samples and “spider” diagrams connect the pools to the population centroid.



## Detecting selection

load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/fly.pv1.RData")
load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/fly.qv1.RData")

fly.pv1.chr2 <- fly.pv1
fly.qv1.chr2 <- fly.qv1


rm(fly.pv1)
rm(fly.qv1)

head(env)
##   country   long     lat             Ind season precseason tempseason   logNPP
## 1 Austria 15.965 48.2875   AT_gr_12_fall   fall   37.24262   744.4373 26.30119
## 2 Austria 15.965 48.2875 AT_gr_12_spring spring   37.24262   744.4373 26.30119
## 3 Austria 15.965 48.2875    AT_Mau_14_01 spring   37.24262   744.4373 26.30119
## 4 Austria 15.965 48.2875    AT_Mau_14_02   fall   37.24262   744.4373 26.30119
## 5 Austria 15.965 48.2875    AT_Mau_15_50 spring   37.24262   744.4373 26.30119
## 6 Austria 15.965 48.2875    AT_Mau_15_51   fall   37.24262   744.4373 26.30119
##env.pca <- rda(env[ , c(2, 3, 6, 7, 8)], scale = T)
env.pca <- rda(env[ , c(6, 7, 8)], scale = T)
summary(env.pca)$cont
## $importance
## Importance of components:
##                          PC1    PC2    PC3
## Eigenvalue            1.2227 0.9646 0.8127
## Proportion Explained  0.4076 0.3215 0.2709
## Cumulative Proportion 0.4076 0.7291 1.0000
par(las = 1)
screeplot(env.pca, bstick = TRUE, type = "lines", las = 1, main = "")

round(scores(env.pca, choices = 1:3, display = "species", scaling = 0), digits = 3)
##               PC1    PC2    PC3
## precseason -0.655  0.273 -0.704
## tempseason  0.658 -0.251 -0.710
## logNPP      0.371  0.929  0.016
## attr(,"const")
## [1] 4.605779
pred.PC2 <- scores(env.pca, choices = 2, display = "sites", scaling = 0)

K <- 9

fly.lfmm1 <- lfmm_ridge(Y = snppca, X = pred.PC2, K = K) ## change K as you see fit

fly.pv1 <- lfmm_test(Y = snppca, X = pred.PC2, lfmm = fly.lfmm1, calibrate = "gif")

names(fly.pv1) # this object includes raw z-scores and p-values, as well as GIF-calibrated scores and p-values
## [1] "B"                 "epsilon.sigma2"    "B.sigma2"         
## [4] "score"             "pvalue"            "gif"              
## [7] "calibrated.score2" "calibrated.pvalue"
## let’s look at the genomic inflation factor (GIF)

fly.pv1$gif
##      PC2 
## 1.359424
fly.pv1.chr2$gif
##      PC2 
## 1.239521
## an appropriately calibrated set of tests will have a GIF of around 1

## let’s look at how application of the GIF to the p-values impacts the p-value distribution


quartz()
layout(matrix(c(1, 1, 2, 2,
                1, 1, 2, 2,
                3, 3, 4, 4,
                3, 3, 4, 4), nrow = 4, ncol = 4, byrow = FALSE))

par(mar = c(5, 5, 1, 1), las = 1)

hist(fly.pv1$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "for", ylim = c(0, 350))
legend("top", "K = 9", bty = "n")
hist(fly.pv1$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 400))
legend("top", "K = 9", bty = "n")

hist(fly.pv1.chr2$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "Chr 2L", ylim = c(0, 150000))
legend("top", "K = 8", bty = "n")
hist(fly.pv1.chr2$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 150000))
legend("top", "K = 8", bty = "n")

## convert the adjusted p-values to q-values. q-values provide a measure of each SNP’s significance, automatically taking into account the fact that thousands are simultaneously being tested. I can then use an FDR threshold to control the number of false positive detections (given that the p-value distribution is “well-behaved”)

fly.qv1 <- qvalue(fly.pv1$calibrated.pvalue)$qvalues
length(which(fly.qv1 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 4
fly.qv1.chr2 <- qvalue(fly.pv1.chr2$calibrated.pvalue)$qvalues
length(which(fly.qv1.chr2 < 0.05)) ## how many SNPs have an FDR < 5%?
## [1] 142
## using K = 9, the default GIF correction, and an FDR threshold of 0.05, I detected 2 candidate SNPs under selection in response to the PC2 environmental predictor

(fly.FDR <- colnames(snppca)[which(fly.qv1 < 0.05)]) ## identify which SNPs these are
## [1] "162922.C" "162922.A" "163269.T" "163269.C"
dim(snppca)
## [1]  151 3100
rm(snppca)

load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/snppca_chr2.RData")

dim(snppca)
## [1]     151 1862322
(fly.FDR.chr2 <- colnames(snppca)[which(fly.qv1.chr2 < 0.05)]) ## identify which SNPs these are
##   [1] "3205.G"   "3205.A"   "13746.A"  "13746.G"  "20886.T"  "20886.A" 
##   [7] "27909.C"  "27909.G"  "38816.C"  "38816.T"  "42047.G"  "42047.A" 
##  [13] "42049.G"  "42049.A"  "44237.C"  "44237.A"  "62254.G"  "62254.T" 
##  [19] "77605.T"  "77605.C"  "84995.C"  "84995.A"  "87409.C"  "87409.A" 
##  [25] "95522.T"  "95522.C"  "105438.T" "105438.G" "106073.G" "106073.C"
##  [31] "110380.A" "110380.T" "120960.A" "120960.T" "133844.T" "133844.C"
##  [37] "163269.T" "163269.C" "169348.G" "169348.C" "170587.G" "170587.A"
##  [43] "178736.T" "178736.C" "211689.G" "211689.T" "224757.T" "224757.G"
##  [49] "249489.G" "249489.A" "271578.A" "271578.C" "271957.G" "271957.A"
##  [55] "293046.G" "293046.A" "295197.T" "295197.C" "295212.C" "295212.T"
##  [61] "309911.C" "309911.A" "311205.G" "311205.A" "337949.C" "337949.T"
##  [67] "347923.A" "347923.G" "348847.C" "348847.T" "377565.A" "377565.G"
##  [73] "387711.G" "387711.A" "388835.C" "388835.T" "415369.G" "415369.T"
##  [79] "420886.C" "420886.T" "425629.G" "425629.A" "425633.G" "425633.C"
##  [85] "457451.C" "457451.A" "462795.G" "462795.C" "504846.T" "504846.C"
##  [91] "529356.T" "529356.G" "529357.G" "529357.C" "533630.C" "533630.T"
##  [97] "568526.C" "568526.G" "590620.G" "590620.A" "639512.A" "639512.G"
## [103] "639937.T" "639937.C" "647272.A" "647272.T" "657788.C" "657788.A"
## [109] "699680.C" "699680.A" "703874.A" "703874.T" "711720.C" "711720.A"
## [115] "725241.C" "725241.A" "745747.C" "745747.T" "839278.C" "839278.G"
## [121] "858195.A" "858195.C" "858669.T" "858669.C" "895244.G" "895244.A"
## [127] "899332.G" "899332.A" "900589.T" "900589.C" "913028.T" "913028.A"
## [133] "939029.G" "939029.A" "959209.G" "959209.C" "961471.T" "961471.A"
## [139] "962082.A" "962082.T" "973543.G" "973543.C"
## visualizing results


## Manhattan plot with causal loci
 
qvalues <- fly.qv1
qvalues.chr2 <- fly.qv1.chr2

row <- as.data.frame(qvalues)
str(row)
## 'data.frame':    3100 obs. of  1 variable:
##  $ PC2: num  0.98 0.98 0.954 0.954 0.98 ...
rownames(row) <- 1:nrow(row)

row.chr2 <- as.data.frame(qvalues.chr2)
str(row.chr2)
## 'data.frame':    1862322 obs. of  1 variable:
##  $ PC2: num  0.995 0.995 0.995 0.995 0.995 ...
rownames(row.chr2) <- 1:nrow(row.chr2)


(causal.set <- as.numeric(rownames(row)[which(row < 0.05)]))
## [1] 1103 1104 1731 1732
(fly.FDR <- gsub("[^0-9]", "", fly.FDR))
## [1] "162922" "162922" "163269" "163269"
head(afis[afis$variant.id == fly.FDR, c(2, 6, 7, 8, 9, 11, 21, 32)], n = 2) # One of the SNPs should be checked carefully as the position on chromosome 2L does not match the annotations. SNP163269 is at pos 3637803, which falls within the for gene. According to FlyBase, the gene mir-4972 stem loop is referred to Dmel\mir-4972 (CR43508, FBgn0263523). It is a miRNA_gene from Dmel. It has 3 annotated transcripts. Gene sequence location is 2L:3636967..3637073. Its molecular function is unknown. The biological processes in which it is involved are not known. No alleles are reported. [https://flybase.org/reports/FBgn0263523](https://flybase.org/reports/FBgn0263523)
##    variant.id                   col     gene chr     pos genotype   type
## 1:     162922 upstream_gene_variant      for  2L 3632475      C,A pooled
## 2:     163269 upstream_gene_variant mir-4972  2L 3637803      T,C pooled
##      stationId
## 1: AUM00011389
## 2: AUM00011389
(causal.set.chr2 <- as.numeric(rownames(row.chr2)[which(row.chr2 < 0.05)]))
##   [1]    6167    6168   25903   25904   39399   39400   52115   52116   71955
##  [10]   71956   77599   77600   77603   77604   81655   81656  115531  115532
##  [19]  143587  143588  157179  157180  161669  161670  176903  176904  195495
##  [28]  195496  196697  196698  204731  204732  224215  224216  247579  247580
##  [37]  301775  301776  313105  313106  315465  315466  330607  330608  391487
##  [46]  391488  414733  414734  459215  459216  500307  500308  501029  501030
##  [55]  539587  539588  543467  543468  543495  543496  570615  570616  573025
##  [64]  573026  622327  622328  640403  640404  642081  642082  695723  695724
##  [73]  714473  714474  716593  716594  766177  766178  776383  776384  785203
##  [82]  785204  785209  785210  844605  844606  854627  854628  932785  932786
##  [91]  978685  978686  978687  978688  986719  986720 1051787 1051788 1093387
## [100] 1093388 1184187 1184188 1184971 1184972 1198467 1198468 1217981 1217982
## [109] 1296481 1296482 1304355 1304356 1318903 1318904 1344395 1344396 1383145
## [118] 1383146 1559575 1559576 1595465 1595466 1596355 1596356 1666265 1666266
## [127] 1674137 1674138 1676511 1676512 1699993 1699994 1748885 1748886 1787129
## [136] 1787130 1791379 1791380 1792525 1792526 1814505 1814506
(fly.FDR.chr2 <- gsub("[^0-9]", "", fly.FDR.chr2))
##   [1] "3205"   "3205"   "13746"  "13746"  "20886"  "20886"  "27909"  "27909" 
##   [9] "38816"  "38816"  "42047"  "42047"  "42049"  "42049"  "44237"  "44237" 
##  [17] "62254"  "62254"  "77605"  "77605"  "84995"  "84995"  "87409"  "87409" 
##  [25] "95522"  "95522"  "105438" "105438" "106073" "106073" "110380" "110380"
##  [33] "120960" "120960" "133844" "133844" "163269" "163269" "169348" "169348"
##  [41] "170587" "170587" "178736" "178736" "211689" "211689" "224757" "224757"
##  [49] "249489" "249489" "271578" "271578" "271957" "271957" "293046" "293046"
##  [57] "295197" "295197" "295212" "295212" "309911" "309911" "311205" "311205"
##  [65] "337949" "337949" "347923" "347923" "348847" "348847" "377565" "377565"
##  [73] "387711" "387711" "388835" "388835" "415369" "415369" "420886" "420886"
##  [81] "425629" "425629" "425633" "425633" "457451" "457451" "462795" "462795"
##  [89] "504846" "504846" "529356" "529356" "529357" "529357" "533630" "533630"
##  [97] "568526" "568526" "590620" "590620" "639512" "639512" "639937" "639937"
## [105] "647272" "647272" "657788" "657788" "699680" "699680" "703874" "703874"
## [113] "711720" "711720" "725241" "725241" "745747" "745747" "839278" "839278"
## [121] "858195" "858195" "858669" "858669" "895244" "895244" "899332" "899332"
## [129] "900589" "900589" "913028" "913028" "939029" "939029" "959209" "959209"
## [137] "961471" "961471" "962082" "962082" "973543" "973543"
head(afis[afis$variant.id == fly.FDR.chr2, c(2, 6, 7, 8, 9, 11, 21, 32)], n = 2)
##    variant.id                   col     gene chr     pos genotype        type
## 1:     163269 upstream_gene_variant mir-4972  2L 3637803      T,C inbred_line
## 2:     163269 upstream_gene_variant mir-4972  2L 3637803      T,C      pooled
##      stationId
## 1: CHM00054511
## 2: USW00004780
data.frame(causal.set.chr2, fly.FDR.chr2)
##     causal.set.chr2 fly.FDR.chr2
## 1              6167         3205
## 2              6168         3205
## 3             25903        13746
## 4             25904        13746
## 5             39399        20886
## 6             39400        20886
## 7             52115        27909
## 8             52116        27909
## 9             71955        38816
## 10            71956        38816
## 11            77599        42047
## 12            77600        42047
## 13            77603        42049
## 14            77604        42049
## 15            81655        44237
## 16            81656        44237
## 17           115531        62254
## 18           115532        62254
## 19           143587        77605
## 20           143588        77605
## 21           157179        84995
## 22           157180        84995
## 23           161669        87409
## 24           161670        87409
## 25           176903        95522
## 26           176904        95522
## 27           195495       105438
## 28           195496       105438
## 29           196697       106073
## 30           196698       106073
## 31           204731       110380
## 32           204732       110380
## 33           224215       120960
## 34           224216       120960
## 35           247579       133844
## 36           247580       133844
## 37           301775       163269
## 38           301776       163269
## 39           313105       169348
## 40           313106       169348
## 41           315465       170587
## 42           315466       170587
## 43           330607       178736
## 44           330608       178736
## 45           391487       211689
## 46           391488       211689
## 47           414733       224757
## 48           414734       224757
## 49           459215       249489
## 50           459216       249489
## 51           500307       271578
## 52           500308       271578
## 53           501029       271957
## 54           501030       271957
## 55           539587       293046
## 56           539588       293046
## 57           543467       295197
## 58           543468       295197
## 59           543495       295212
## 60           543496       295212
## 61           570615       309911
## 62           570616       309911
## 63           573025       311205
## 64           573026       311205
## 65           622327       337949
## 66           622328       337949
## 67           640403       347923
## 68           640404       347923
## 69           642081       348847
## 70           642082       348847
## 71           695723       377565
## 72           695724       377565
## 73           714473       387711
## 74           714474       387711
## 75           716593       388835
## 76           716594       388835
## 77           766177       415369
## 78           766178       415369
## 79           776383       420886
## 80           776384       420886
## 81           785203       425629
## 82           785204       425629
## 83           785209       425633
## 84           785210       425633
## 85           844605       457451
## 86           844606       457451
## 87           854627       462795
## 88           854628       462795
## 89           932785       504846
## 90           932786       504846
## 91           978685       529356
## 92           978686       529356
## 93           978687       529357
## 94           978688       529357
## 95           986719       533630
## 96           986720       533630
## 97          1051787       568526
## 98          1051788       568526
## 99          1093387       590620
## 100         1093388       590620
## 101         1184187       639512
## 102         1184188       639512
## 103         1184971       639937
## 104         1184972       639937
## 105         1198467       647272
## 106         1198468       647272
## 107         1217981       657788
## 108         1217982       657788
## 109         1296481       699680
## 110         1296482       699680
## 111         1304355       703874
## 112         1304356       703874
## 113         1318903       711720
## 114         1318904       711720
## 115         1344395       725241
## 116         1344396       725241
## 117         1383145       745747
## 118         1383146       745747
## 119         1559575       839278
## 120         1559576       839278
## 121         1595465       858195
## 122         1595466       858195
## 123         1596355       858669
## 124         1596356       858669
## 125         1666265       895244
## 126         1666266       895244
## 127         1674137       899332
## 128         1674138       899332
## 129         1676511       900589
## 130         1676512       900589
## 131         1699993       913028
## 132         1699994       913028
## 133         1748885       939029
## 134         1748886       939029
## 135         1787129       959209
## 136         1787130       959209
## 137         1791379       961471
## 138         1791380       961471
## 139         1792525       962082
## 140         1792526       962082
## 141         1814505       973543
## 142         1814506       973543
#snps.sel.chr2 <- unique(fly.FDR.chr2)
#save(snps.sel.chr2, file = "snps.sel.chr2")

quartz()

layout(matrix(c(0, 1, 1, 0,
                0, 1, 1, 0,
                0, 2, 2, 0,
                0, 2, 2, 0), nrow = 4, ncol = 4, byrow = TRUE))


plot(-log10(qvalues), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1, ylim = c(0, 2), main = "for")
points(causal.set, -log10(qvalues)[causal.set], pch = 19, col = alpha("red", 0.3))
text(causal.set, (-log10(qvalues)[causal.set] - 0.1), fly.FDR, col = "red")


plot(-log10(qvalues.chr2), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1, ylim = c(0, 3), main = "Chr 2L")
points(causal.set.chr2[c(37, 38)], -log10(qvalues.chr2)[causal.set.chr2[c(37, 38)]], pch = 19, col = alpha("red", 0.3))
text(causal.set.chr2[c(37, 38)], (-log10(qvalues.chr2)[causal.set.chr2[c(37, 38)]] - 0.1), fly.FDR.chr2[c(37, 38)], col = "red", cex = 0.5)

quartz()

## png("adaptation_revision.png", width = 7, height = 7, units = "in", res = 300)

## pdf("adaptation_revision.pdf")


layout(matrix(c(1, 1, 2, 2,
                1, 1, 2, 2,
                3, 3, 4, 4,
                3, 3, 4, 4), nrow = 4, ncol = 4, byrow = FALSE))

hist(fly.pv1$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "for", ylim = c(0, 350))
legend("top", paste("K = 9\nGIF = ", round(fly.pv1$gif, 3)), bty = "n")
mtext("(a)", side = 2, at = 365, line = 2.5, las = 1, font = 2, family = "serif")

plot(-log10(qvalues), pch = 19, cex = 1, bg = "gray", col = alpha("gray", 0.6), xlab = "SNP", las = 1, ylim = c(0, 2))
points(causal.set, -log10(qvalues)[causal.set], pch = 19, col = "black")
text(causal.set, (-log10(qvalues)[causal.set] - 0.15), fly.FDR, col = "black")

hist(fly.pv1.chr2$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "Chr 2L", ylim = c(0, 150000))
legend("top", paste("K = 8\nGIF = ", round(fly.pv1.chr2$gif, 3)), bty = "n")
mtext("(b)", side = 2, at = 158010, line = 4, las = 1, font = 2, family = "serif")

plot(-log10(qvalues.chr2), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1, ylim = c(0, 3), main = "")
points(causal.set.chr2[-c(37, 38)], -log10(qvalues.chr2)[causal.set.chr2[-c(37, 38)]], pch = 19, col = "black")
points(causal.set.chr2[c(37, 38)], -log10(qvalues.chr2)[causal.set.chr2[c(37, 38)]], pch = 19, col = "red")
text(causal.set.chr2[c(37, 38)], (-log10(qvalues.chr2)[causal.set.chr2[c(37, 38)]] - 0.13), fly.FDR.chr2[c(37, 38)], col = "red", cex = 0.8)

## dev.off()

Figure 5. Genotype-environment association test based on a latent factor mixed model with ridge penalty. (a) Distribution of adjusted p-values using the default genomic inflation factor, and the for loci (SNPs) potentially affected by the PC2 predictor variable. Likewise, (b) displays the same analysis for the entire chromosome 2L. Bold dots represent loci considered to be under selection according to a False Discovery Rate of 0.05.